In [4]:
%matplotlib inline

import time
import calendar
import codecs
import datetime
import json
import sys
import gzip
import string
import glob
import os
import numpy as np

In [5]:
if ( sys.version_info.major == 3 ):
    from functools import reduce

Module 0: Reading Tweets

The first thing we do is read in tweets from a directory of compressed files. Our collection of compressed tweets is in the data_files/twitter directory, so we'll use pattern matching (called "globbing") to find all the tweet files in that directory.

Then, for each file, we'll open it, read each line (which is a tweet in JSON form), and build an object out of it. As part of this process, we will extract each tweet's post time and create a map from minute timestamps to the tweets posted during that minute.


In [3]:
tweetPath = os.path.join("data_files", "twitter")
tweetFiles = {
   "time01": os.path.join(tweetPath, "statuses.*.gz")
}

frequencyMap = {}
globalTweetCounter = 0

timeFormat = "%a %b %d %H:%M:%S +0000 %Y"

reader = codecs.getreader("utf-8")

for (key, path) in tweetFiles.items():
    localTweetList = []
    for filePath in glob.glob(path):
        print ("Reading File:", filePath)
        
        for line in gzip.open(filePath, 'rb'):

            # Decode the raw bytes and parse the tweet JSON into an object
            tweetObj = None
            try:
                tweetObj = json.loads(reader.decode(line)[0])
            except Exception as e:
                continue

            # Skip deleted and withheld status messages
            if ( "delete" in tweetObj.keys() or "status_withheld" in tweetObj.keys() ):
                continue

            # Try to extract the time of the tweet
            try:
                currentTime = datetime.datetime.strptime(tweetObj['created_at'], timeFormat)
            except:
                print (line)
                raise

            currentTime = currentTime.replace(second=0)
            
            # Increment tweet count
            globalTweetCounter += 1
            
            # If our frequency map already has this time, use it, otherwise add
            if ( currentTime in frequencyMap.keys() ):
                timeMap = frequencyMap[currentTime]
                timeMap["count"] += 1
                timeMap["list"].append(tweetObj)
            else:
                frequencyMap[currentTime] = {"count":1, "list":[tweetObj]}

# Fill in any gaps
times = sorted(frequencyMap.keys())
firstTime = times[0]
lastTime = times[-1]
thisTime = firstTime

timeIntervalStep = datetime.timedelta(seconds=60)    # One-minute time step
while ( thisTime <= lastTime ):
    if ( thisTime not in frequencyMap.keys() ):
        frequencyMap[thisTime] = {"count":0, "list":[]}
        
    thisTime = thisTime + timeIntervalStep

print ("Processed Tweet Count:", globalTweetCounter)


Reading File: data_files/twitter/statuses.log.2014-08-13-16.gz
Reading File: data_files/twitter/statuses.log.2014-08-13-17.gz
Reading File: data_files/twitter/statuses.log.2014-08-13-18.gz
Reading File: data_files/twitter/statuses.log.2014-08-13-19.gz
Reading File: data_files/twitter/statuses.log.2014-08-13-20.gz
Reading File: data_files/twitter/statuses.log.2014-08-13-21.gz
Reading File: data_files/twitter/statuses.log.2014-08-13-22.gz
Reading File: data_files/twitter/statuses.log.2014-08-13-23.gz
Reading File: data_files/twitter/statuses.log.2014-08-14-00.gz
Reading File: data_files/twitter/statuses.log.2014-08-14-01.gz
Reading File: data_files/twitter/statuses.log.2014-08-14-02.gz
Reading File: data_files/twitter/statuses.log.2014-08-14-03.gz
Reading File: data_files/twitter/statuses.log.2014-08-14-04.gz
Reading File: data_files/twitter/statuses.log.2014-08-14-05.gz
Reading File: data_files/twitter/statuses.log.2014-08-14-06.gz
Reading File: data_files/twitter/statuses.log.2014-08-14-07.gz
Reading File: data_files/twitter/statuses.log.2014-08-14-08.gz
Reading File: data_files/twitter/statuses.log.2014-08-14-09.gz
Reading File: data_files/twitter/statuses.log.2014-08-14-10.gz
Reading File: data_files/twitter/statuses.log.2014-08-14-11.gz
Reading File: data_files/twitter/statuses.log.2014-08-14-12.gz
Reading File: data_files/twitter/statuses.log.2014-08-14-13.gz
Reading File: data_files/twitter/statuses.log.2014-08-14-14.gz
Reading File: data_files/twitter/statuses.log.2014-08-14-15.gz
Reading File: data_files/twitter/statuses.log.2014-08-14-16.gz
Reading File: data_files/twitter/statuses.log.2014-08-14-17.gz
Processed Tweet Count: 293560

Module 1: Simple Frequency Analysis

In this section, we will cover a few simple analysis techniques for quickly gaining insight into the data.

  • Twitter Timeline
  • Top Twitter Users
  • Twitter API
  • Posting Frequency Distribution
  • Popular Hashtags
  • Simple Event Detection
  • Language Distributions

Twitter Timeline

To build a timeline of Twitter usage, we can simply plot the number of tweets posted per minute.


In [6]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
fig.set_size_inches(18.5,10.5)

plt.title("Tweet Frequency")

# Sort the times into an array for future use
sortedTimes = sorted(frequencyMap.keys())

# What time span do these tweets cover?
print ("Time Frame:", sortedTimes[0], sortedTimes[-1])

# Get a count of tweets per minute
postFreqList = [frequencyMap[x]["count"] for x in sortedTimes]

# We'll have ticks every thirty minutes (anything finer clutters the graph)
smallerXTicks = range(0, len(sortedTimes), 30)
plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=90)

# Plot the post frequency
ax.plot(range(len(frequencyMap)), [x if x > 0 else 0 for x in postFreqList], color="blue", label="Posts")
ax.grid(b=True, which=u'major')
ax.legend()

plt.show()


Time Frame: 2014-08-13 16:00:00 2014-08-14 17:44:00

Top Twitter Users

Ferguson was a contentious topic, and many people had differing opinions about the issue. Given the volume of tweets we are analyzing, we can now identify who the "loudest" voices were during this time.

That is, who was tweeting the most during this particular time span?


In [7]:
# Create maps for holding counts and tweets for each user
globalUserCounter = {}
globalUserMap = {}

# Iterate through the time stamps
for t in sortedTimes:
    timeObj = frequencyMap[t]
    
    # For each tweet, pull the screen name and add it to the list
    for tweet in timeObj["list"]:
        user = tweet["user"]["screen_name"]
        
        if ( user not in globalUserCounter ):
            globalUserCounter[user] = 1
            globalUserMap[user] = [tweet]
        else:
            globalUserCounter[user] += 1
            globalUserMap[user].append(tweet)

print ("Unique Users:", len(globalUserCounter.keys()))


Unique Users: 171087

In [8]:
sortedUsers = sorted(globalUserCounter, key=globalUserCounter.get, reverse=True)
print ("Top Ten Most Prolific Users:")
for u in sortedUsers[:10]:
    print (u, globalUserCounter[u], "\n\t", "Random Tweet:", globalUserMap[u][0]["text"], "\n----------")


Top Ten Most Prolific Users:
miserablecitytv 94 
	 Random Tweet: RT @AntonioFrench: One pattern of the last few days: After police move the media out of the area, they become more heavy-handed and violent… 
----------
FreeGCF 80 
	 Random Tweet: RT @WesleyLowery: Ppl in #Ferguson v sensitive to media descriptions of rioting, so worth noting: only behavior accurately described as suc… 
----------
PLNoHope 70 
	 Random Tweet: RT @iAirDry: “@BlkSportsOnline: A story on why what is happening in #Ferguson is a daily fear for many African-Americans http://t.co/0BrHqb… 
----------
kingpin7666 66 
	 Random Tweet: RT @HalpernAlex: This is #Ferguson, a suburb in America. http://t.co/GfmHLo4u5q 
----------
desperate_jo13 60 
	 Random Tweet: RT @TuxcedoCat: #Ferguson Police Department:
Riot Gear ✔️
Tear Gas ✔️
Camouflage ✔️
Assault Rifles ✔️
Armored Land Mine Vehicles ✔️
Dashboa… 
----------
TxWomenRock 57 
	 Random Tweet: RT @SoulRevision: Yes &amp; they cover #Ferguson RT @jkendall82: @OwlsAsylum @SoulRevision  This is audio from St. Louis dispatch, not Ferguson… 
----------
No_Cut_Card 56 
	 Random Tweet: really? RT @kayquinn: #Ferguson police chief:  he was upset body of #MikeBrown laid in street so long after shooting. 
----------
sierramike320 55 
	 Random Tweet: RT @ChdRountree: America will prove its worth in how it responds to the #MikeBrown tragedy in #Ferguson @OpFerguson 
----------
Petapup1 55 
	 Random Tweet: RT @RiseCoffeeSTL: Calling all journalists covering #MikeBrown #Ferguson FREE coffee/Wifi if you need a place to work! @AntonioFrench http:… 
----------
TheAPJournalist 53 
	 Random Tweet: RT @jrosenbaum: Media scrum before #Ferguson Police chief conference: http://t.co/3Cwuv4xqPr 
----------

Twitter API

It's difficult to see who these people are, but we can go back to the Twitter API and get user descriptions for more information.


In [9]:
import tweepy

consumer_key = "RfWoIb9wocCY0kOYKUYnf5VOo"
consumer_secret = "FqsdZGdD4yvzwPj0yoe7lHRxgG4tjz2WVZbozxpOPnDunMhzv9"
access_token = "2421639553-0IF33x71RsEJL2aKCksu0C1VR8383nqRQK0dYSE"
access_token_secret = "3wSJCvLhgPBi8NUNVWbvosK2DAraGgB9K0NN0URNLVWjs"

# Set up the authorization mechanisms for Tweepy to access Twitter's API
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.secure = True
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth)

In [10]:
print ("Top Ten Most Prolific Users:")
for u in sortedUsers[:10]:
    print (u, globalUserCounter[u])

    # Get user info
    try:
        user = api.get_user(u)
        print ("\tDescription:", user.description)
    except Exception as te:
        print ("\tDescription Error:", te)
        
    print ("----------")


Top Ten Most Prolific Users:
miserablecitytv 94
	Description: Aggregator, Agitator & Amplifier - - - - - - - - - - - - - - - - - - - Striving for social justice is the most valuable thing to do in life. - Albert Einstein
----------
FreeGCF 80
	Description: @xmariedrewbtrx
----------
PLNoHope 70
	Description Error: Twitter error response: status code = 404
----------
kingpin7666 66
	Description: Veteran Of Three wars- Special Operations- #blacklivesmatter- The Future of the Democratic Party- Guilty of Being Black in A White America
----------
desperate_jo13 60
	Description: i am jo, and i am desperate for justice. O_O [pronouns: they/them/their]
----------
TxWomenRock 57
	Description: I block hellions, imps, & brutes  || IF WOMEN CHOOSE THEY CAN MAKE #WENDYDAVIS THE NEXT GOVERNOR OF TEXAS  || img by http://t.co/WGShienQMh
----------
No_Cut_Card 56
	Description: life enthusiast. disgruntled #Wizards fan.
----------
sierramike320 55
	Description: Advocate for the unheard and ignored. Interested in the earth, life, learning, love, and justice.  Baseball is Healing.
----------
Petapup1 55
	Description: Just me! #Prochoice, MARRIED, #atheist and a Diehard Capitals Fan! nsfw! #lgbtq
----------
TheAPJournalist 53
	Description: Co-executive and global news editor at @ReadByline. Word writer, straight talker, political junkie and data enthusiast. (Not affiliated with the @AP)
----------

Distribution of Postings

It appears a few users were posting to Twitter a lot. But how often did most Twitter users tweet during this time? We can build a histogram to see this distribution.


In [13]:
plt.figure(figsize=(16,8))
    
# the histogram of the data
plt.hist(
    [globalUserCounter[x] for x in globalUserCounter], 
    bins=100, 
    normed=0, 
    alpha=0.75,
    label="Counts",
    log=True)

plt.xlabel('Number of Tweets')
plt.ylabel('Counts')
plt.title("Histogram of Frequency")
plt.grid(True)
plt.legend()

plt.show()


Average Number of Posts


In [14]:
avgPostCount = np.mean([globalUserCounter[x] for x in globalUserCounter])
print("Average Number of Posts:", avgPostCount)


Average Number of Posts: 1.71585216878

Popular Hashtags

Hashtags give us a quick way to view the conversation and see what people are discussing. Getting the most popular hashtags is just as easy as getting the most prolific users.


In [15]:
# A map for hashtag counts
hashtagCounter = {}

# For each minute, pull the list of hashtags and add to the counter
for t in sortedTimes:
    timeObj = frequencyMap[t]
    
    for tweet in timeObj["list"]:
        hashtagList = tweet["entities"]["hashtags"]
        
        for hashtagObj in hashtagList:
            
            # We lowercase the hashtag to avoid duplicates (e.g., #MikeBrown vs. #mikebrown)
            hashtagString = hashtagObj["text"].lower()
            
            if ( hashtagString not in hashtagCounter ):
                hashtagCounter[hashtagString] = 1
            else:
                hashtagCounter[hashtagString] += 1

print ("Unique Hashtags:", len(hashtagCounter.keys()))
sortedHashtags = sorted(hashtagCounter, key=hashtagCounter.get, reverse=True)
print ("Top Twenty Hashtags:")
for ht in sortedHashtags[:20]:
    print ("\t", "#" + ht, hashtagCounter[ht])


Unique Hashtags: 6555
Top Twenty Hashtags:
	 #ferguson 209701
	 #mikebrown 17824
	 #mediablackout 5322
	 #gaza 4497
	 #michaelbrown 2541
	 #dontshoot 1968
	 #anonymous 1836
	 #stl 1607
	 #palestine 1542
	 #prayforferguson 1525
	 #justiceformikebrown 1322
	 #opferguson 1160
	 #myawhite 995
	 #usa 956
	 #policestate 906
	 #fergusonshooting 875
	 #tcot 805
	 #inners 773
	 #iraq 736
	 #fergusonriot 656

Event Detection w/ Keyword Frequency

Twitter is good for breaking news. When an impactful event occurs, we often see a spike in the usage of related keywords on Twitter. Some examples are below.


In [18]:
# What keywords are we interested in?
targetKeywords = ["obama", "tear gas"]
# targetKeywords.append("lowery")
# targetKeywords.append("reilly")
targetKeywords.append("iraq")

# Build an empty map for each keyword we are searching for
targetCounts = {x:[] for x in targetKeywords}
totalCount = []

# For each minute, pull the tweet text and search for the keywords we want
for t in sortedTimes:
    timeObj = frequencyMap[t]
    
    # Temporary counter for this minute
    localTargetCounts = {x:0 for x in targetKeywords}
    localTotalCount = 0
    
    for tweetObj in timeObj["list"]:
        tweetString = tweetObj["text"].lower()

        localTotalCount += 1
        
        # Add to the counter if the target keyword is in this tweet
        for keyword in targetKeywords:
            if ( keyword in tweetString ):
                localTargetCounts[keyword] += 1
                
    # Add the counts for this minute to the main counter
    totalCount.append(localTotalCount)
    for keyword in targetKeywords:
        targetCounts[keyword].append(localTargetCounts[keyword])
        
# Now plot the total frequency and frequency of each keyword
fig, ax = plt.subplots()
fig.set_size_inches(18.5,10.5)

plt.title("Tweet Frequency")
plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=90)

ax.plot(range(len(frequencyMap)), totalCount, label="Total")

for keyword in targetKeywords:
    ax.plot(range(len(frequencyMap)), targetCounts[keyword], label=keyword)
ax.legend()
ax.grid(b=True, which=u'major')

plt.show()


Language Distribution

The protests in Ferguson, MO became an international topic of discussion. As a result, people all over the world were tweeting about the events. Using Twitter's data, we can see how many people were tweeting in different languages.


In [19]:
# A map for counting each language
languageCounter = {}

for t in sortedTimes:
    timeObj = frequencyMap[t]
    
    for tweet in timeObj["list"]:
        lang = tweet["lang"]
        
        if ( lang not in languageCounter ):
            languageCounter[lang] = 1
        else:
            languageCounter[lang] += 1

In [20]:
languages = sorted(languageCounter.keys(), key=languageCounter.get, reverse=True)

for l in languages:
    print (l, languageCounter[l])


en 282138
es 3759
und 1882
de 1133
tr 795
fr 623
et 476
sk 463
tl 330
in 306
ar 282
it 241
pt 191
da 160
nl 140
ht 127
pl 81
sl 72
ja 62
sv 57
vi 54
no 51
th 29
ru 20
hu 16
is 14
fa 12
el 10
zh 10
lt 6
lv 5
fi 5
ko 4
hi 2
bg 2
iw 1
iu 1

In [21]:
plt.figure(figsize=(16,8))
    
# Bar chart of tweet counts per language (log scale)
plt.bar(
    np.arange(len(languages)),
    [languageCounter[x] for x in languages],
    log=True)

plt.xticks(np.arange(len(languages)) + 0.5, languages)
plt.xlabel('Languages')
plt.ylabel('Counts (Log)')
plt.title("Language Frequency")
plt.grid(True)

plt.show()



Module 2: Geolocation in Twitter

Twitter allows users to share their GPS locations when tweeting, but only about 2% of tweets have this information. We can extract this geospatial data to look at patterns in different locations.

In this module, we will look at:

  • Filtering Twitter data with GPS data
  • Plotting GPS Data
  • Splitting data based on location
  • Top users inside/outside Ferguson
  • Top hashtags inside/outside Ferguson

Filtering GPS Data

Each tweet has a field called "coordinates" describing where the tweet was posted from. The field might be null if the tweet contains no location data, or it might contain GPS coordinates in the form of (longitude, latitude); coarser place and bounding-box information lives in separate fields. We want tweets with this GPS data.

For more information on tweet JSON formats, check out https://dev.twitter.com/overview/api/tweets
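
For reference, here is a minimal sketch of the part of a geo-coded tweet we care about (the field layout follows the Twitter API; the values themselves are made up for illustration):

# A hypothetical, abbreviated geo-coded tweet (values are illustrative only)
sampleTweet = {
    "created_at": "Wed Aug 13 16:05:00 +0000 2014",
    "text": "Example tweet #Ferguson",
    "coordinates": {
        "type": "Point",
        "coordinates": [-90.27, 38.75]   # (longitude, latitude)
    }
}

# The filter in the next cell keeps tweets where this field is present and non-null
hasGps = sampleTweet["coordinates"] is not None and "coordinates" in sampleTweet["coordinates"]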


In [22]:
# A frequency map for timestamps to geo-coded tweets
geoFrequencyMap = {}
geoCount = 0

# Save only those tweets with a tweet['coordinates']['coordinates'] entity
for t in sortedTimes:
    geos = list(filter(lambda tweet: tweet["coordinates"] != None and "coordinates" in tweet["coordinates"], frequencyMap[t]["list"]))
    geoCount += len(geos)
    
    # Add to the timestamp map
    geoFrequencyMap[t] = {"count": len(geos), "list": geos}

print ("Number of Geo Tweets:", geoCount)


Number of Geo Tweets: 2000

GPS Frequency

What is the frequency of GPS-coded tweets?


In [23]:
fig, ax = plt.subplots()
fig.set_size_inches(18.5,10.5)

plt.title("Geo Tweet Frequency")

postFreqList = [geoFrequencyMap[x]["count"] for x in sortedTimes]

smallerXTicks = range(0, len(sortedTimes), 30)
plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=45)

ax.plot(range(len(geoFrequencyMap)), [x if x > 0 else 0 for x in postFreqList], color="blue", label="Posts")
ax.grid(b=True, which=u'major')
ax.legend()

plt.show()


Plotting GPS Data

Now that we have a list of all the tweets with GPS coordinates, we can plot where in the world these tweets were posted. To do this, we can leverage the Basemap package to draw a map of the world and convert the GPS coordinates into (x, y) positions we can then plot.


In [24]:
import matplotlib

from mpl_toolkits.basemap import Basemap

# Create a list of all geo-coded tweets
tmpGeoList = [geoFrequencyMap[t]["list"] for t in sortedTimes]
geoTweets = reduce(lambda x, y: x + y, tmpGeoList)

# For each geo-coded tweet, extract its GPS coordinates
geoCoord = [x["coordinates"]["coordinates"] for x in geoTweets]

# Now we build a map of the world using Basemap
land_color = 'lightgray'
water_color = 'lightblue'

fig, ax = plt.subplots(figsize=(24,24))
worldMap = Basemap(projection='merc', llcrnrlat=-80, urcrnrlat=80,
                   llcrnrlon=-180, urcrnrlon=180, resolution='l')

worldMap.fillcontinents(color=land_color, lake_color=water_color, zorder=1)
worldMap.drawcoastlines()
worldMap.drawparallels(np.arange(-90.,120.,30.))
worldMap.drawmeridians(np.arange(0.,420.,60.))
worldMap.drawmapboundary(fill_color=water_color, zorder=0)
ax.set_title('World Tweets')

# Convert points from GPS coordinates to (x,y) coordinates
convPoints = [worldMap(p[0], p[1]) for p in geoCoord]
x = [p[0] for p in convPoints]
y = [p[1] for p in convPoints]
worldMap.scatter(x, y, s=100, marker='x', color="red", zorder=2)

plt.show()


Filtering By Location

We can even use existing Geographic Information System (GIS) tools to determine from where a tweet was posted. For example, we could ask whether a particular tweet was posted from the United States.

To make this determination, we can use geocoding services like Google Maps, or we can use GIS data files called shape files, which contain geometric information for a variety of geographic entities (e.g., lakes, roads, county lines, states, and countries).

For our purposes, we pulled a shape file containing the county borders for the state of Missouri, sourced from the US Census Bureau (http://www.census.gov/cgi-bin/geo/shapefiles2010/layers.cgi).

The first step, then, is to read in this shape file. To divide the Twitter data into tweets posted from inside Ferguson, MO and those from outside, we find the county containing Ferguson (St. Louis County) and extract the shapes for that county.


In [25]:
# Create a new map to hold the shape file data
stLouisMap = Basemap(llcrnrlon=-130, llcrnrlat=22, urcrnrlon=-64,
                     urcrnrlat=52, projection='merc', lat_1=33, lat_2=45,
                     lon_0=-95, resolution='i', area_thresh=10000)

# Read in the shape file
moStateShapeFile = os.path.join("data_files", "moCountyShapes", "tl_2010_29_county10")
shp_info = stLouisMap.readshapefile(moStateShapeFile, 'states', drawbounds=True)

# Find only those polygons that describe St. Louis county
stLouisCountyPolygons = []
for (shapeDict, shape) in zip(stLouisMap.states_info, stLouisMap.states):
    if (shapeDict["NAME10"] == "St. Louis"):
        stLouisCountyPolygons.append(matplotlib.patches.Polygon(shape))
        
print ("Shape Count:", len(stLouisCountyPolygons))


Shape Count: 2

For each tweet, we can check whether its GPS coordinates came from St. Louis county or not.


In [26]:
# Maps of timestamps to tweets for inside/outside Ferguson
inStLouisFreqMap = {}
outStLouisFreqMap = {}

# For each geo-coded tweet, extract coordinates and convert them to the Basemap space
for t in sortedTimes:
    geos = geoFrequencyMap[t]["list"]
    convPoints = [(stLouisMap(tw["coordinates"]["coordinates"][0], tw["coordinates"]["coordinates"][1]), tw) for tw in geos]

    # Local counters for this time
    inStLouisFreqMap[t] = {"count": 0, "list": []}
    outStLouisFreqMap[t] = {"count": 0, "list": []}
    
    # For each point, check if it is within St. Louis county or not
    for point in convPoints:
        x = point[0][0]
        y = point[0][1]

        inStLouisFlag = False

        for polygon in stLouisCountyPolygons:
            if ( polygon.contains_point((x, y)) ):
                inStLouisFreqMap[t]["list"].append(point[1])
                inStLouisFlag = True
                break

        if ( inStLouisFlag == False ):
            outStLouisFreqMap[t]["list"].append(point[1])

print ("Tweets in St. Louis:", np.sum([len(inStLouisFreqMap[t]["list"]) for t in sortedTimes]))
print ("Tweets outside St. Louis:", np.sum([len(outStLouisFreqMap[t]["list"]) for t in sortedTimes]))


Tweets in St. Louis: 100
Tweets outside St. Louis: 1900

Top Twitter Users Inside/Outside Ferguson

Now that we have divided the data based on those who were tweeting from within Ferguson, MO versus those who were outside, we can identify the most prolific users in each group.

Top Users Inside Ferguson


In [27]:
inStLouisTweets = reduce(lambda x, y: x + y, [inStLouisFreqMap[t]["list"] for t in sortedTimes])

userCounter = {}
userMap = {}

for tweet in inStLouisTweets:
    user = tweet["user"]["screen_name"]

    if ( user not in userCounter ):
        userCounter[user] = 1
        userMap[user] = [tweet]
    else:
        userCounter[user] += 1
        userMap[user].append(tweet)

print ("Unique Users in St. Louis:", len(userCounter.keys()))
sortedUsers = sorted(userCounter, key=userCounter.get, reverse=True)


Unique Users in St. Louis: 79

In [28]:
print("Top Users in Ferguson:")
for u in sortedUsers[:10]:
    print (u, userCounter[u])
    
    # Get user info
    try:
        user = api.get_user(u)
        print ("\t", user.description)
    except Exception as te:
        print ("\t", te)
        
    print ("\t", userMap[u][0]["text"], "\n----------")


Top Users in Ferguson:
TheWidowJones 6
	 Instant gratification takes too long.
	 “@mattdpearce: Quick writeup of my call to the Ferguson police chief, notifying him of arrests: http://t.co/l3ugLhoZKy” jebus 
----------
jst_mani 3
	 Just a Mizzou Tiger trying to earn her stripes. When you believe in yourself, there's never any room for doubt. If you want something, go get it. Period.
	 http://t.co/4nTUmvrlAq 
----------
leisazigman 3
	 President of The Genome Partnership, a non profit working in the field of genomics
	 Just in: ACLU sends letter to #Ferguson pd concerned they asked vigils/protests just be held during day #MikeBrown 
----------
RoadRunnerSTL 3
	 Follow photojournalist Bobby Hughes as he hits the streets overnight and in the early morning for breaking news around St. Louis.
	 About 100 or more demonstrate outside Ferguson City Hall into the early morning hours . http://t.co/NbaB2YpzNs 
----------
CaseyNolen 3
	 @ksdknews Multimedia Journalist : @theninenetwork Host of #StayTunedSTL http://t.co/fx4hH3lN9N
	 STL Co Pros says "all evidence" will eventually be made public regardless of outcome of investigation. But not while ongoing. #Ferguson 
----------
michaelcalhoun 2
	 I tell stories for @KMOXNews and occasionally for @CBSRadioNews. Ke$ha told me once that she liked my beard. mrcalhoun@cbs.com
	 To restate: protestor with loudspeaker said that when media leave "y'all [in crowd] better watch out" because police will act. #Ferguson 
----------
jasonahuff 2
	 Husband. Dad. Fundraiser. Proud #STL native. Craft beer enthusiast. Tweets are my own.
	 Most major tv &amp; print news in #STL have btwn 20-58K followers. @AntonioFrench  has almost 38K. #CitizenJournalists #Ferguson 
----------
DochtuirRussell 2
	 Doctor of Pharmacy. Law Student at SLU. Veteran.
Cheering for Blue Jays, Cardinals, Blues and Habs.
	 Lovely destination #Ferguson seems http://t.co/gzuU8iAc3V 
----------
ShayMeinecke 2
	 #journalist
	 Peaceful protest on #Ferguson #vice.com @ Ferguson, MO http://t.co/1lXXkYGcg6 
----------
DonGallowayKSDK 2
	 Photojournalist, KSDK NewsChannel 5
	 Church of God &amp; Christ, Ferguson MO helping with the healing. #MichaelBrown #FergusonShooting #ksdknews http://t.co/MahrpsTr6d 
----------

Top Users Outside Ferguson


In [29]:
outStLouisTweets = reduce(lambda x, y: x + y, [outStLouisFreqMap[t]["list"] for t in sortedTimes])

userCounter = {}
userMap = {}

for tweet in outStLouisTweets:
    user = tweet["user"]["screen_name"]

    if ( user not in userCounter ):
        userCounter[user] = 1
        userMap[user] = [tweet]
    else:
        userCounter[user] += 1
        userMap[user].append(tweet)

print ("Unique Users outside St. Louis:", len(userCounter.keys()))
sortedUsers = sorted(userCounter, key=userCounter.get, reverse=True)


Unique Users outside St. Louis: 1689

In [30]:
print ("Top Ten Most Prolific Users:")
for u in sortedUsers[:10]:
    print (u, userCounter[u])
    
    # Get user info
    try:
        user = api.get_user(u)
        print ("\t", user.description)
    except Exception as te:
        print ("\t", te)
        
    print ("\t", userMap[u][0]["text"], "\n----------")


Top Ten Most Prolific Users:
orlando_tina 8
	 retired model I love my dogs and hunting deer
	 Where is Obama now?# Ferguson 
----------
ipimi 8
	 Politically Progressive Boomer Feminist Geek Physician Writer...And~proud US Census member of The African Diaspora Mid-70sHowardZinnStudent@BU ♓️(3*17'54DCnw)
	 “@HuffPostPol: Ferguson police to meet with Michael Brown's mother http://t.co/t9obrJAayC”

#inners 
----------
Its_XADR 6
	 18. 12/11 Designer. Sniper. http://t.co/58xDj308tH @TahhDah
	 @Obey_Rebirthh just click this and see #Ferguson 
----------
sametaydoan4 5
	 unutur mu sevdiğini görmeyince göz
	 #Ferguson crowd threw rocks, bottles, Molotov cocktails, saw a gun, right before this stepped off http://t.co/uKrO1L7kj9 
----------
poetarmone 5
	 Twitter error response: status code = 404
	 #ferguson
I am pissed at all this foolish Marshall law talk,STOP SAYING IT, STOP LIEING 
----------
yummyyo 4
	 Pause and Pray! If that doesn't work, you didn't do it. The VIP area is a 5-foot radius around me always. I'm fun; have fun with me!
	 http://t.co/aOGje7ZPlP 
----------
itsablackguy 4
	 if u try to roast me or whatever, just remember i been fat for over 20 years. ive heard all the jokes https://t.co/ebOPMbzW2W
	 My coworkers haven't mentioned #Ferguson all week. I'm not surprised though... 
----------
KeithJonesJr 4
	 Public Servant, Campaign Manager, Socialite, Club Promoter, Social Activist for the LGBT movement, future lawyer & MOREHOUSE Man. #RGODC #Leo Kik: KeithJonesJr
	 I see many lawsuits coming. #Ferguson 
----------
ronnyshreve 3
	 Freelance Journalist. Host of the Ron Shreve Show. Reporting on Government corruption, the police state, and crony capitalism ronshrevetips@yahoo.com
	 The police in  #Ferguson are criminal scum and need to be arrested and put in prison! #WakeUpNow #tcot 
----------
minnman47 3
	 world affairs in the crosshairs...trigger finger on the pulse of society
	 http://t.co/Bi36ltzFBE 
----------

Hashtags By Location

We've already looked at popular hashtags over the course of the day. How does this usage change from inside Ferguson to outside?


In [31]:
inStlHashtagCounter = {}

for tweet in inStLouisTweets:
    hashtagList = tweet["entities"]["hashtags"]

    for hashtagObj in hashtagList:
        hashtagString = hashtagObj["text"].lower()

        if ( hashtagString not in inStlHashtagCounter ):
            inStlHashtagCounter[hashtagString] = 1
        else:
            inStlHashtagCounter[hashtagString] += 1

print ("Unique Hashtags in Ferguson:", len(inStlHashtagCounter.keys()))
sortedInStlHashtags = sorted(inStlHashtagCounter, key=inStlHashtagCounter.get, reverse=True)
print ("Top Twenty Hashtags in Ferguson:")
for ht in sortedInStlHashtags[:20]:
    print ("\t", "#" + ht, inStlHashtagCounter[ht])


Unique Hashtags in Ferguson: 19
Top Twenty Hashtags in Ferguson:
	 #ferguson 64
	 #mikebrown 9
	 #michaelbrown 3
	 #ksdk 3
	 #stl 3
	 #justiceformikebrown 2
	 #vice 1
	 #freefrench 1
	 #icebucketchallenge 1
	 #neoamerica 1
	 #moleg 1
	 #noco 1
	 #sandyhook 1
	 #citizenjournalists 1
	 #rai 1
	 #ksdknews 1
	 #barackobama 1
	 #fergusonshooting 1
	 #free 1

In [32]:
outStlHashtagCounter = {}

for tweet in outStLouisTweets:
    hashtagList = tweet["entities"]["hashtags"]

    for hashtagObj in hashtagList:
        hashtagString = hashtagObj["text"].lower()

        if ( hashtagString not in outStlHashtagCounter ):
            outStlHashtagCounter[hashtagString] = 1
        else:
            outStlHashtagCounter[hashtagString] += 1

print ("Unique Hashtags Outside Ferguson:", len(outStlHashtagCounter.keys()))
sortedOutStlHashtags = sorted(outStlHashtagCounter, key=outStlHashtagCounter.get, reverse=True)
print ("Top Twenty Hashtags Outside Ferguson:")
for ht in sortedOutStlHashtags[:20]:
    print ("\t", "#" + ht, outStlHashtagCounter[ht])


Unique Hashtags Outside Ferguson: 273
Top Twenty Hashtags Outside Ferguson:
	 #ferguson 1199
	 #mikebrown 63
	 #mediablackout 37
	 #fergusonriot 18
	 #prayforferguson 16
	 #fergusonshooting 15
	 #gaza 15
	 #justiceformikebrown 12
	 #michaelbrown 10
	 #policestate 9
	 #inners 9
	 #dontshoot 9
	 #policebrutality 7
	 #tcot 7
	 #police 6
	 #ripmikebrown 6
	 #anonymous 6
	 #stl 6
	 #handsupdontshoot 5
	 #iftheygunnedmedown 4

Module 3: Media in Twitter

Twitter is excellent for sharing media, whether photographs, videos, or links to websites. When you share pictures, Twitter stores them and links to them directly. We can use this data to sample some random pictures taken during each hour of the data we have.

We'll look at:

  • Images By Hour
  • Images Inside Ferguson
  • Images Outside Ferguson

Images By Hour

First, we need to reduce our map of minutes->tweets to hours->tweets.


In [33]:
hourlyInterval = {}

for t in sortedTimes:
    newTime = t.replace(second=0, minute=0)
    
    currentTimeObject = frequencyMap[t]
    if ( newTime not in hourlyInterval ):
        hourlyInterval[newTime] = {
            "count": currentTimeObject["count"],
            "list": currentTimeObject["list"]
            }
    else:
        hourlyInterval[newTime]["count"] += currentTimeObject["count"]
        hourlyInterval[newTime]["list"] = hourlyInterval[newTime]["list"] + currentTimeObject["list"]

Next, we filter out retweets and keep only those tweets with a media listing in the "entities" section. For each hour, we then select a random image from that hour's pictures and display it.


In [34]:
from IPython.display import display
from IPython.display import Image

for h in sorted(hourlyInterval.keys()):
    noRetweets = list(filter(lambda tweet: not tweet["text"].lower().startswith("rt"), hourlyInterval[h]["list"]))
    tweetsWithMedia = list(filter(lambda tweet: "media" in tweet["entities"], noRetweets))
    print (h, hourlyInterval[h]["count"], len(tweetsWithMedia), )
    
    # Pick one random media tweet from this hour
    randIndex = np.random.random_integers(0, len(tweetsWithMedia)-1)
    imgUrl = tweetsWithMedia[randIndex]["entities"]["media"][0]["media_url"]
    display(Image(url=imgUrl))


2014-08-13 16:00:00 1004 20
2014-08-13 17:00:00 2098 34
2014-08-13 18:00:00 1542 31
2014-08-13 19:00:00 1453 28
2014-08-13 20:00:00 2390 48
2014-08-13 21:00:00 2139 67
2014-08-13 22:00:00 2011 57
2014-08-13 23:00:00 5943 149
2014-08-14 00:00:00 10551 166
2014-08-14 01:00:00 16081 196
2014-08-14 02:00:00 30646 318
2014-08-14 03:00:00 32112 416
2014-08-14 04:00:00 26010 330
2014-08-14 05:00:00 18796 288
2014-08-14 06:00:00 17034 269
2014-08-14 07:00:00 18170 278
2014-08-14 08:00:00 9649 173
2014-08-14 09:00:00 4929 83
2014-08-14 10:00:00 4299 87
2014-08-14 11:00:00 5740 112
2014-08-14 12:00:00 7297 130
2014-08-14 13:00:00 11649 226
2014-08-14 14:00:00 15035 244
2014-08-14 15:00:00 15784 304
2014-08-14 16:00:00 17571 324
2014-08-14 17:00:00 13627 218

Pictures from Inside Ferguson, MO

We can also extract images people tweeted from Ferguson.


In [35]:
stlTweetsWithMedia = list(filter(lambda tweet: "media" in tweet["entities"], inStLouisTweets))
print ("Tweets with Media:", len(stlTweetsWithMedia))

for tweet in stlTweetsWithMedia:
    imgUrl = tweet["entities"]["media"][0]["media_url"]
    display(Image(url=imgUrl))


Tweets with Media: 16

Pictures from Outside Ferguson, MO

Here, we extract 10 random images from outside Ferguson.


In [37]:
outStlTweetsWithMedia = list(filter(lambda tweet: "media" in tweet["entities"], outStLouisTweets))
print ("Tweets outside St. Louis with Media:", len(outStlTweetsWithMedia))

np.random.shuffle(outStlTweetsWithMedia)
for tweet in outStlTweetsWithMedia[:10]:
    imgUrl = tweet["entities"]["media"][0]["media_url"]
    display(Image(url=imgUrl))


Tweets outside St. Louis with Media: 188

Module 4: Sentiment Analysis

Another popular type of analysis people do on social networks is "sentiment analysis," which is used to figure out how people feel about a specific topic.

One way to explore sentiment is to use a list of keywords with tagged sentiment information (e.g., "happy" or "awesome" might have high sentiment whereas "terrible" or "awful" might have very low sentiment). Then, we can count the occurrence of these tagged keywords to get a sense of how people feel about the topic at hand.

We use the AFINN Sentiment Dictionary for our keyword list. Link here: http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6010

  • Emotive Keywords and Emoticons
  • Per-Tweet Average Sentiment
  • Sentiment Over Time
  • GIS + Sentiment
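
Before loading the real dictionaries below, here is a minimal worked sketch of this scoring scheme applied to a single string, using two made-up keyword/valence pairs rather than the actual lookup tables:

import re

# Hypothetical keyword/valence pairs; the real ones are read from the lookup tables below
demoValenceList = [(re.compile("terribl[a-z]*"), -3), (re.compile("hope[a-z]*"), 2)]
demoText = "a terrible night, but still hopeful"

# Average the valence of every keyword that matches the text
matches = [valence for (pattern, valence) in demoValenceList if pattern.search(demoText)]
demoScore = sum(matches) / len(matches) if matches else 0.0
print(demoScore)    # (-3 + 2) / 2 = -0.5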

In [38]:
import re

# Read in the sentiment/valence files
dataFilePath = os.path.join("data_files", "SentiStrength")
valenceFile = os.path.join(dataFilePath, "EmotionLookupTable.txt")
emoticonFile = os.path.join(dataFilePath, "EmoticonLookupTable.txt")

valenceList = []

# Open the valence file and read in each word/valence pair
for line in open(valenceFile, "r"):
    # Split the line based on tabs and select the first two elements
    (word, valence) = line.split("\t")[:2]
    
    wordRegex = re.compile(word)
    valencePair = (wordRegex, int(valence))
    valenceList.append(valencePair)
    
# Open the emoticon file and read in the valence for each emoticon
for line in codecs.open(emoticonFile, "r", "utf-8"):
    # Split the line based on tabs and select the first two elements
    (emoticon, valence) = line.split("\t")[:2]
    
    emoticonRegex = re.compile(re.escape(emoticon))
    valencePair = (emoticonRegex, int(valence))
    valenceList.append(valencePair)
    
print ("Number of Sentiment Keywords:", len(valenceList))


Number of Sentiment Keywords: 2659

In [39]:
# Examples of sentiment pairs
for i in np.random.random_integers(0, len(valenceList)-1, 10):
    print(valenceList[i][0].pattern, "\t", valenceList[i][1])


drown[a-z]* 	 -2
desirable 	 4
intolleran[a-z]* 	 -3
hopefully 	 1
plays 	 2
pained 	 -2
dirt 	 -2
intimidat[a-z]* 	 -4
XP 	 1
disagre[a-z]* 	 -2

In [40]:
# Generate sentiment measures for each time
timeSentiments = {}
for t in sortedTimes:
    
    tweetList = frequencyMap[t]["list"]
    sentimentList = []
    thisMinuteSentiment = None
    
    for tweet in tweetList:
        
        # Calculate the average sentiment for this tweet
        tweetText = tweet["text"].lower()

        # skip retweets
        if ( tweetText.startswith("rt ") ):
            continue

        valCount = 0
        valSum = 0.0
        valAvg = 0.0
        for valencePair in valenceList:
            if ( valencePair[0].search(tweetText) is not None ):
                valCount += 1
                valSum += valencePair[1]

        if ( valCount > 0 ):
            valAvg = valSum / valCount
            sentimentList.append(valAvg)
    
    if ( len(sentimentList) > 0 ):
        thisMinuteSentiment = np.array(sentimentList).mean()
    else:
        thisMinuteSentiment = 0.0
        
    timeSentiments[t] = thisMinuteSentiment

In [41]:
fig, ax = plt.subplots()
fig.set_size_inches(18.5,10.5)

plt.title("Sentiment Over Time")

postFreqList = [frequencyMap[x]["count"] for x in sortedTimes]
sentList = [timeSentiments[x] for x in sortedTimes]

smallerXTicks = range(0, len(sortedTimes), 30)
plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=90)

ax.plot(range(len(frequencyMap)), [x if x > 0 else 0 for x in postFreqList], color="blue", label="Posts")

ax2 = ax.twinx()
ax2.plot([0], [0], color="blue", label="Posts")
ax2.plot(range(len(frequencyMap)), sentList, color="green", label="Sentiment")
ax2.set_ylim(-6,6)

ax.grid(b=True, which=u'major')
ax2.legend()

plt.show()


Based on this data, we can see that most people are pretty unhappy with the events in Ferguson, MO. This result is not all that unexpected.

GIS + Sentiment


In [42]:
fig, ax = plt.subplots()
fig.set_size_inches(18.5,10.5)

plt.title("Sentiment Histrogram")

for (loc, (tweetList, color)) in {"Inside": (inStLouisTweets, "green"), "Outside": (outStLouisTweets, "blue")}.items():

    localSentimentList = []
    for tweet in tweetList:

        # Calculate the average sentiment for this tweet
        tweetText = tweet["text"].lower()

        # skip retweets
        if ( tweetText.startswith("rt ") ):
            continue

        valCount = 0
        valSum = 0.0
        valAvg = 0.0
        for valencePair in valenceList:
            if ( valencePair[0].search(tweetText) is not None ):
                valCount += 1
                valSum += valencePair[1]

        if ( valCount > 0 ):
            valAvg = valSum / valCount
            localSentimentList.append(valAvg)

    print("Number of Sentiment Tweets:", len(localSentimentList))

    ax.hist(localSentimentList, range=(-5, 5), normed=True, alpha=0.5, color=color, label=loc)

ax.grid(b=True, which=u'major')
ax.legend()

plt.show()


Number of Sentiment Tweets: 88
Number of Sentiment Tweets: 1799

Module 5: Topic Modeling

Along with sentiment analysis, a question often asked of social networks is "What are people talking about?" We can answer this question using tools from topic modeling and natural language processing, and we can even divide this data to see what people in Ferguson are talking about versus those outside.

To generate these topic models, we will use the Gensim package's implementation of Latent Dirichlet Allocation (LDA), which constructs a set of topics, each described as a probability distribution over the words in our tweets. Several other topic-modeling methods exist as well.

  • Topics Across Twitter
  • Topics in Ferguson
  • Topics outside Ferguson

In [43]:
import gensim.models.ldamodel
import gensim.matutils
import sklearn.cluster
import sklearn.feature_extraction 
import sklearn.feature_extraction.text
import sklearn.metrics
import sklearn.preprocessing

from nltk.corpus import stopwords

We first extract the text of all English tweets that are not retweets and make the text lowercase.


In [44]:
enFilter = lambda x: True if x["lang"] == "en" else False

# Get all tweets, filter out retweets, keep only those in English, and convert the text to lowercase
allTweetList = reduce(lambda x, y: x + y, [frequencyMap[t]["list"] for t in sortedTimes])
noRetweetsList = list(filter(lambda x: not x["text"].lower().startswith("rt"), allTweetList))
onlyEnglishTweets = list(filter(enFilter, noRetweetsList))
lowerTweetText = [x["text"].lower() for x in onlyEnglishTweets]

print ("All Tweet Count:", len(allTweetList))
print ("Reduced Tweet Count:", len(lowerTweetText))


All Tweet Count: 293560
Reduced Tweet Count: 57121

Now we build a list of stop words (words we don't care about) and a feature generator (the vectorizer) that assigns an integer ID to each token it finds in the text.


In [45]:
enStop = stopwords.words('english')

# Skip stop words, retweet signs, @ symbols, and URL headers
stopList = enStop + ["http", "https", "rt", "@", ":"]

vectorizer = sklearn.feature_extraction.text.CountVectorizer(strip_accents='unicode', 
                                                             tokenizer=None,
                                                             token_pattern='(?u)#?\\b\\w+[\'-]?\\w+\\b',
                                                             stop_words=stopList,
                                                             binary=True)
# Create a vectorizer for all our content
vectorizer.fit(lowerTweetText)

# Get all the words in our text
names = vectorizer.get_feature_names()

# Create a map for vectorizer IDs to words
id2WordDict = dict(zip(range(len(vectorizer.get_feature_names())), names))
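
As a quick sanity check (a hedged sketch using the vectorizer we just fit), we can see how the custom token pattern handles hashtags, which survive as single tokens thanks to the optional leading "#" in the pattern:

# The analyzer applies lowercasing, the custom token pattern, and stop-word removal
analyze = vectorizer.build_analyzer()
print(analyze("Protesters gathered in #Ferguson tonight"))
# Roughly: ['protesters', 'gathered', '#ferguson', 'tonight']  ("in" is a stop word)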

Topics Across Twitter

We then use the vectorizer to transform our tweet text into a feature set: essentially a table with a row per tweet and a column per keyword, where each cell indicates whether that keyword appears in the tweet (our vectorizer is binary, so it records presence rather than raw counts).

We then convert that table into a corpus the Gensim package can handle, apply LDA, and print the top 10 topics along with the 10 words that best describe each.


In [46]:
# Create a Gensim corpus from the vectorized tweet text
corpus = vectorizer.transform(lowerTweetText)
gsCorpus = gensim.matutils.Sparse2Corpus(corpus, documents_columns=False)
# lda = gensim.models.ldamodel.LdaModel(gsCorpus, id2word=id2WordDict, num_topics=10)
lda = gensim.models.LdaMulticore(gsCorpus, id2word=id2WordDict, num_topics=100, passes=2)

ldaTopics = lda.show_topics(num_topics=10, num_words=10, formatted=False)
topicTokens = [[token for (_,token) in topic] for topic in ldaTopics]
for i in range(len(topicTokens)):
    print ("Topic:", i)
    for token in topicTokens[i]:
        print ("\t", token)


Topic: 0
	 believe
	 #ferguson
	 can't
	 ferguson
	 hard
	 images
	 police
	 terrifying
	 co
	 going
Topic: 1
	 #ferguson
	 investigate
	 co
	 step
	 ferguson
	 five
	 line
	 fair
	 police
	 #palestine
Topic: 2
	 co
	 bullets
	 rubber
	 police
	 ferguson
	 clash
	 gas
	 tear
	 #ferguson
	 protesters
Topic: 3
	 #ferguson
	 co
	 act
	 military
	 police
	 ferguson
	 like
	 shows
	 become
	 #missouri
Topic: 4
	 disney
	 ferguson
	 channel
	 #ferguson
	 bc
	 y'all
	 playing
	 shows
	 people
	 days
Topic: 5
	 ferguson
	 people
	 #ferguson
	 officials
	 mad
	 pres
	 side
	 videos
	 obama
	 distract
Topic: 6
	 brown
	 co
	 ferguson
	 live
	 watch
	 mike
	 michael
	 mo
	 livestream
	 shooting
Topic: 7
	 ferguson
	 #ferguson
	 peace
	 pray
	 thank
	 god
	 people
	 idea
	 heart
	 living
Topic: 8
	 #ferguson
	 police
	 cameras
	 co
	 turn
	 ones
	 ferguson
	 ridiculous
	 shouldn't
	 animals
Topic: 9
	 rights
	 #ferguson
	 civil
	 ferguson
	 amendment
	 different
	 co
	 tone
	 1st
	 movement

Topics Inside Ferguson

We do the same thing with only those tweets in Ferguson to find topics people are discussing there.


In [47]:
inStlLowerTweetText = [x["text"].lower() for x in filter(enFilter, inStLouisTweets)]

corpus = vectorizer.transform(inStlLowerTweetText)
gsCorpus = gensim.matutils.Sparse2Corpus(corpus, documents_columns=False)
lda = gensim.models.ldamulticore.LdaMulticore(gsCorpus, id2word=id2WordDict, num_topics=10, passes=10)

ldaTopics = lda.show_topics(num_topics=10, num_words=10, formatted=False)
topicTokens = [[token for (_,token) in topic] for topic in ldaTopics]
for i in range(len(topicTokens)):
    print ("Topic:", i)
    for token in topicTokens[i]:
        print ("\t", token)


Topic: 0
	 #ferguson
	 co
	 ferguson
	 #mikebrown
	 peaceful
	 police
	 stand
	 arrested
	 antoniofrench
	 different
Topic: 1
	 ferguson
	 police
	 like
	 #ferguson
	 situation
	 actions
	 co
	 obama
	 seems
	 media
Topic: 2
	 #ferguson
	 ferguson
	 #mikebrown
	 outside
	 much
	 sen
	 mccaskill
	 got
	 protests
	 freedom
Topic: 3
	 #ferguson
	 police
	 report
	 co
	 protest
	 time
	 wants
	 conversation
	 one
	 media
Topic: 4
	 man
	 ferguson
	 right
	 chief
	 antoniofrench
	 news
	 justice
	 twitter
	 conference
	 wrapped
Topic: 5
	 right
	 cops
	 around
	 watching
	 citizens
	 still
	 can't
	 covering
	 20
	 miles
Topic: 6
	 #ferguson
	 co
	 police
	 ferguson
	 simply
	 i'm
	 it's
	 way
	 tweets
	 tear
Topic: 7
	 ferguson
	 #ferguson
	 co
	 mo
	 shot
	 i'm
	 going
	 chief
	 mattdpearce
	 amp
Topic: 8
	 ferguson
	 #ferguson
	 co
	 city
	 outside
	 people
	 crowds
	 100
	 hall
	 i'm
Topic: 9
	 #ferguson
	 co
	 situation
	 police
	 stl
	 says
	 behind
	 something
	 like
	 well

Topics Outside Ferguson


In [48]:
outStlLowerTweetText = [x["text"].lower() for x in filter(enFilter, outStLouisTweets)]

corpus = vectorizer.transform(outStlLowerTweetText)
gsCorpus = gensim.matutils.Sparse2Corpus(corpus, documents_columns=False)
lda = gensim.models.ldamulticore.LdaMulticore(gsCorpus, id2word=id2WordDict, num_topics=50, passes=10)

ldaTopics = lda.show_topics(num_topics=10, num_words=10, formatted=False)
topicTokens = [[token for (_,token) in topic] for topic in ldaTopics]
for i in range(len(topicTokens)):
    print ("Topic:", i)
    for token in topicTokens[i]:
        print ("\t", token)


Topic: 0
	 #ferguson
	 co
	 ferguson
	 amp
	 unarmed
	 shooting
	 arrested
	 live
	 reporters
	 police
Topic: 1
	 #ferguson
	 co
	 ferguson
	 don't
	 team
	 #mikebrown
	 want
	 go
	 johnlegend
	 never
Topic: 2
	 #ferguson
	 channel
	 ferguson
	 disney
	 worse
	 instead
	 twitter
	 went
	 pretty
	 world
Topic: 3
	 ferguson
	 #ferguson
	 y'all
	 hood
	 story
	 bring
	 way
	 black
	 disney
	 don't
Topic: 4
	 #ferguson
	 ferguson
	 amp
	 it's
	 that's
	 officer
	 police
	 i'm
	 come
	 state
Topic: 5
	 #ferguson
	 co
	 ferguson
	 happened
	 going
	 police
	 #fergusonriot
	 damn
	 missouri
	 time
Topic: 6
	 #ferguson
	 co
	 police
	 ferguson
	 news
	 ridiculous
	 obama
	 heart
	 watching
	 it's
Topic: 7
	 ferguson
	 #ferguson
	 police
	 shit
	 know
	 going
	 people
	 co
	 cops
	 even
Topic: 8
	 #ferguson
	 co
	 real
	 police
	 ferguson
	 crews
	 leave
	 area
	 asked
	 news
Topic: 9
	 #ferguson
	 police
	 co
	 chief
	 moment
	 ferguson
	 #fergusonriot
	 using
	 one
	 events

Module 6: Network Analysis

Issues of race, class, poverty, and police militarization all came out during the protests and clashes with law enforcement, and it didn't take much to find people on either side of each issue on Twitter. At the same time, people were turning to Twitter for news about the events on the ground since many perceived that mainstream media wasn't giving the events adequate or fair coverage. Using network analysis, we can get some idea about who the most important Twitter users were during this time, and how people split into groups online.

For this analysis, we'll use the NetworkX package to construct a social graph of how people interact. Each person in our Twitter data will be a node in our graph, and edges in the graph will represent mentions during this timeframe. Then we will explore a few simple analytical methods in network analysis, including:

  • Graph Building
  • User Centrality
  • Network Visualization

Graph Building

To limit the amount of data we're looking at, we'll only build the network for people who have GPS locations in their tweets and the people they mention. We build this network by iterating through all the tweets in our GPS list and extracting the "user_mentions" list from the "entities" section of each tweet object. For each mention a user makes, we add an edge from that user to the mentioned user.

In addition, we will append a location attribute to each user based on whether we saw them in Ferguson or outside of Ferguson.


In [49]:
import networkx as nx

graph = nx.DiGraph()

geoCodedMap = {1: inStLouisTweets, 0: outStLouisTweets}

for (location, locationList) in geoCodedMap.items():
    print (location, len(locationList))
    
    for tweet in locationList:
        userName = tweet["user"]["screen_name"]
        graph.add_node(userName, loc=location)

        mentionList = tweet["entities"]["user_mentions"]
        
        for otherUser in mentionList:
            otherUserName = otherUser["screen_name"]
            if ( graph.has_node(otherUserName) == False ):
                graph.add_node(otherUserName, loc=-1)
            graph.add_edge(userName, otherUserName)
        
print ("Number of Users:", len(graph.node))


0 1900
1 100
Number of Users: 2345

Central Users

In network analysis, "centrality" is used to measure the importance of a given node. Many different types of centrality are used to describe various types of importance though. Examples include "closeness centrality," which measures how close a node is to all other nodes in the network, versus "betweeness centrality," which measures how many shortest paths run through the given node. Nodes with high closeness centrality are important for rapidly disseminating information or spreading disease, whereas nodes with high betweeness are more important to ensure the network stays connected.

PageRank is another algorithm for measuring importance, proposed by Sergey Brin and Larry Page for an early version of Google's search engine. NetworkX has an implementation of PageRank that we can use to find the most important/authoritative users on Twitter based on their connections to other users.


In [50]:
pageRankList = nx.pagerank_numpy(graph)

highRankNodes = sorted(pageRankList.keys(), key=pageRankList.get, reverse=True)
for x in highRankNodes[:20]:
    user = api.get_user(x)
    print (x, pageRankList[x], "\n\t", user.description, "\n----------")


BarackObama 0.008778991145676325 
	 This account is run by Organizing for Action staff. Tweets from the President are signed -bo. 
----------
WesleyLowery 0.0063034776506989 
	 seek truth :: give voice to the voiceless :: shine light in the darkness 
----------
AntonioFrench 0.004588161213234277 
	 Dad, Husband, Alderman of the @21stWard in St. Louis, founder of @North_Campus, Auburn grad and devoted Auburn Football fan, @WUSTL MBA. 
----------
CNN 0.0027568741839318553 
	 It’s our job to #GoThere and tell the most difficult stories. Come with us! 
----------
TheAnonMessage 0.002739265654803952 
	 Coming soon. 
----------
GovJayNixon 0.0027146137140248924 
	 Official Twitter feed of Missouri Governor Jay Nixon 
----------
ThoughtCatalog 0.0025173981877923404 
	 All thinking is relevant. 
----------
YourAnonNews 0.0024680943062341918 
	 Signal boost for Anonymous operations, resistance movements, & journalism.  #ShutItDown 
----------
JavoPerez_ 0.0023201826615596288 
	 Employed | IE | Tread Lightly | IG: Javoperez_ 
----------
JohnnyLoud_Pack 0.002320182661559628 
	 Living this #SagLife as a #Dreadhead... I Aint Shit Yet!!! Cleveland 216 #footlocker #towercity 
----------
DLRMiller 0.0023201826615596253 
	 I like drawing, painting, and chocolate. 
----------
washingtonpost 0.0022462268392225634 
	 Tweets about everything from breaking news to bad restaurants. Sometimes global, sometimes local. Led by @hermanywong and @MiGold. 
----------
maddow 0.00219692295766441 
	 I see political people...
(Retweets do not imply endorsement.) 
----------
FoxNews 0.0021476190761062636 
	 America’s Strongest Primetime Lineup Anywhere! Follow America's #1 cable news network, delivering you breaking news, insightful analysis, and must-see videos. 
----------
msnbc 0.001920949282968822 
	 The place for in-depth analysis, political commentary and informed perspectives. Have questions? Tweet to @Farrashley, @NishaChittal and @digimuller. 
----------
chrislhayes 0.0016075174644920428 
	 Host of All In with Chris Hayes on MSNBC, Weeknights at 8pm. Editor at Large at The Nation. Cubs fan. 
----------
ryanjreilly 0.0015806244381876187 
	 @HuffingtonPost Justice Reporter on #DOJ, #SCOTUS, #Guantanamo, #Ferguson et al.
Previously: @TPM, @MainJustice
202-527-9261 
ryan.reilly@huffingtonpost.com 
----------
ksdknews 0.0014425735698248158 
	 Where the News Comes First
for breaking news, weather and 
sports both on-air and online. Retweets are not endorsements. #STLTogether 
----------
jonswaine 0.001408060852734104 
	 Senior reporter for @GuardianUS jon.swaine@theguardian.com 
----------
natedrug 0.0013834089119550623 
	 dead inside. 
----------

Network Visualization

A lot of information can be gleaned from visualizing how the users in this network interact, and in Python we can plot the network relatively easily.


In [51]:
print (len(graph.nodes(data=True)))

colors = [0.9 if x[1]["loc"] == 1 else 0.1 for x in graph.nodes(data=True)]
pos = {x:(np.random.rand(2) * 10) for x in graph.nodes()}
nx.draw_networkx_nodes(graph, pos, node_color=colors)
nx.draw_networkx_edges(graph, pos)


2345
Out[51]:
<matplotlib.collections.LineCollection at 0x2e2c9bb38>

This graph is relatively uninformative, so we will turn to other tools for better visualization.

We first save this graph to a file, so we can import into other tools.


In [52]:
nx.write_graphml(graph, "inVsOutNetwork.graphml", encoding='utf-8', prettyprint=False)

NodeXL Demo


In [ ]:


In [ ]:
# If you want to play with the full graph, 
# here is code that will build it up for you.
# Be careful. It's large.

fullGraph = nx.DiGraph()

inStlUsers = set(map(lambda x: x["user"]["screen_name"], inStLouisTweets))
outStlUsers = set(map(lambda x: x["user"]["screen_name"], outStLouisTweets))

for (userName, tweetList) in globalUserMap.items():
    
    location = -1
    if ( userName in inStlUsers ):
        location = 1
    elif (userName in outStlUsers ):
        location = 0
        
    fullGraph.add_node(userName, loc=location)

    for tweet in tweetList:
        mentionList = tweet["entities"]["user_mentions"]

        for otherUser in mentionList:
            otherUserName = otherUser["screen_name"]
            if ( fullGraph.has_node(otherUserName) == False ):
                fullGraph.add_node(otherUserName, loc=-1)
            fullGraph.add_edge(userName, otherUserName)
            
print ("Number of Users:", len(fullGraph.node))

nx.write_graphml(fullGraph, "fullNetwork.graphml", encoding='utf-8', prettyprint=False)