Text Analytics

In which we analyze the Mood of the nation from inferences on SOTU by the POTUS

(State Of The Union addresses by the President Of The US)

Goal is to find interesting words in the speeches that reflect the times.

Am sure Lincoln didn't worry about WMDs and Iraq; neither did George Washington about inflation, Wall Street and Jobs.

In [1]:
from pyspark.context import SparkContext
print "Running Spark Version %s" % (sc.version)

Running Spark Version 1.6.0

In [2]:
from pyspark.conf import SparkConf
conf = SparkConf()
print conf.toDebugString()


MapReduce in one line !

1. Split lines into words on space

2. Create key-value pair with key=word, value = 1

3. Sum value for each word (er ... key)

4. Then we get key-value RDD with key=word and value = number of times the word occured in a document

In [3]:
# imports
from operator import add

In [4]:
lines = sc.textFile("sotu/2009-2014-BO.txt")
word_count_bo = lines.flatMap(lambda x: x.split(' ')).\
    map(lambda x: (x.lower().rstrip().lstrip().rstrip(',').rstrip('.'), 1)).\
#6658 without lower, 6299 with lower, rstrip,lstrip 4835


In [5]:
lines = sc.textFile("sotu/2009-2015-BO.txt")
word_count_bo_2015 = lines.flatMap(lambda x: x.split(' ')).\
    map(lambda x: (x.lower().rstrip().lstrip().rstrip(',').rstrip('.').replace(u"\u2019", "'"), 1)).\


In [6]:
#output = word_count_bo.collect()
#for (word, count) in output:
#    print "%s: %i" % (word, count)

In [7]:
lines = sc.textFile("sotu/2001-2008-GWB.txt")
word_count_gwb = lines.flatMap(lambda x: x.split(' ')).map(lambda x: (x.lower().rstrip().lstrip().rstrip(',').rstrip('.'), 1)).reduceByKey(add)


In [8]:
lines = sc.textFile("sotu/1994-2000-WJC.txt")
word_count_wjc = lines.flatMap(lambda x: x.split(' ')).map(lambda x: (x.lower().rstrip().lstrip().rstrip(',').rstrip('.'), 1)).reduceByKey(add)


In [9]:
lines = sc.textFile("sotu/1961-1963-JFK.txt")
word_count_jfk = lines.flatMap(lambda x: x.split(' ')).map(lambda x: (x.lower().rstrip().lstrip().rstrip(',').rstrip('.'), 1)).reduceByKey(add)


In [10]:
lines = sc.textFile("sotu/1934-1945-FDR.txt")
word_count_fdr = lines.flatMap(lambda x: x.split(' ')).map(lambda x: (x.lower().rstrip().lstrip().rstrip(',').rstrip('.'), 1)).reduceByKey(add)


In [11]:
lines = sc.textFile("sotu/1861-1864-AL.txt")
word_count_al = lines.flatMap(lambda x: x.split(' ')).map(lambda x: (x.lower().rstrip().lstrip().rstrip(',').rstrip('.'), 1)).reduceByKey(add)


In [12]:
lines = sc.textFile("sotu/1790-1796-GW.txt")
word_count_gw = lines.flatMap(lambda x: x.split(' ')).map(lambda x: (x.lower().rstrip().lstrip().rstrip(',').rstrip('.'), 1)).reduceByKey(add)


In [13]:
common_words = ["","us","has","all", "they", "from", "who","what","on","by","more","as","not","their","can",

In [14]:
word_count_bo_1 = word_count_bo.sortBy(lambda x: x[1],ascending=False)

In [15]:
for x in word_count_bo_1.take(10):
    print x

(u'the', 1812)
(u'and', 1375)
(u'to', 1351)
(u'of', 1013)
(u'a', 802)
(u'that', 778)
(u'we', 719)
(u'our', 698)
(u'in', 637)
(u'', 585)

In [16]:
word_count_bo_clean = word_count_bo_1.filter(lambda x: x[0] not in common_words)

In [17]:


In [18]:
for x in word_count_bo_clean.take(20):
    print x

(u'jobs', 148)
(u'people', 144)
(u'american', 133)
(u'america', 131)
(u'years', 116)
(u'work', 108)
(u'americans', 105)
(u'time', 89)
(u'energy', 87)
(u'tonight', 84)
(u'congress', 83)
(u'country', 82)
(u'help', 81)
(u'economy', 79)
(u'tax', 76)
(u'right', 75)
(u'businesses', 69)
(u'my', 65)
(u'world', 63)
(u'government', 58)

In [19]:
word_count_bo_2015_clean = word_count_bo_2015.filter(lambda x: x[0] not in common_words)

In [20]:
word_count_gwb_clean = word_count_gwb.filter(lambda x: x[0] not in common_words)
for x in word_count_gwb_clean.sortBy(lambda x: x[1],ascending=False).take(15):
    print x

(u'america', 207)
(u'people', 158)
(u'must', 153)
(u'world', 131)
(u'country', 108)
(u'american', 104)
(u'americans', 99)
(u'congress', 99)
(u'security', 98)
(u'help', 90)
(u'nation', 88)
(u'terrorists', 83)
(u'iraq', 80)
(u'freedom', 79)
(u'tonight', 76)

In [21]:
word_count_wjc_clean = word_count_wjc.filter(lambda x: x[0] not in common_words)
for x in word_count_wjc_clean.sortBy(lambda x: x[1],ascending=False).take(15):
    print x

(u'must', 289)
(u'people', 266)
(u'work', 194)
(u'america', 188)
(u'years', 176)
(u'children', 153)
(u'americans', 152)
(u'congress', 147)
(u'american', 136)
(u'help', 117)
(u'care', 116)
(u'world', 108)
(u'health', 102)
(u'tonight', 98)
(u'support', 93)

In [22]:
word_count_jfk_clean = word_count_wjc.filter(lambda x: x[0] not in common_words)
for x in word_count_jfk_clean.sortBy(lambda x: x[1],ascending=False).take(15):
    print x

(u'must', 289)
(u'people', 266)
(u'work', 194)
(u'america', 188)
(u'years', 176)
(u'children', 153)
(u'americans', 152)
(u'congress', 147)
(u'american', 136)
(u'help', 117)
(u'care', 116)
(u'world', 108)
(u'health', 102)
(u'tonight', 98)
(u'support', 93)

In [23]:
word_count_fdr_clean = word_count_fdr.filter(lambda x: x[0] not in common_words)
for x in word_count_fdr_clean.sortBy(lambda x: x[1],ascending=False).take(15):
    print x

(u'war', 238)
(u'world', 167)
(u'must', 161)
(u'government', 154)
(u'people', 141)
(u'national', 130)
(u'other', 124)
(u'nation', 122)
(u'nations', 111)
(u'peace', 111)
(u'united', 106)
(u'congress', 105)
(u'american', 97)
(u'many', 94)
(u'great', 91)

In [24]:
word_count_al_clean = word_count_al.filter(lambda x: x[0] not in common_words)
for x in word_count_al_clean.sortBy(lambda x: x[1],ascending=False).take(15):
    print x

(u'states', 148)
(u'upon', 84)
(u'united', 81)
(u'any', 79)
(u'congress', 77)
(u'government', 73)
(u'people', 70)
(u'other', 69)
(u'war', 64)
(u'country', 62)
(u'great', 61)
(u'union', 53)
(u'shall', 53)
(u'time', 51)
(u'under', 50)

In [25]:
word_count_gw_clean = word_count_gw.filter(lambda x: x[0] not in common_words)
for x in word_count_gw_clean.sortBy(lambda x: x[1],ascending=False).take(15):
    print x

(u'states', 91)
(u'united', 86)
(u'public', 54)
(u'your', 47)
(u'government', 47)
(u'made', 39)
(u'upon', 38)
(u'my', 37)
(u'other', 37)
(u'citizens', 33)
(u'state', 32)
(u'country', 31)
(u'shall', 30)
(u'peace', 30)
(u'present', 28)

Has Barack Obama changed in 2015 ?

As reflected in the SOTU 2009-2015 vs SOTU 2009-2014 ?

In [26]:
for x in word_count_bo_2015_clean.subtractByKey(word_count_bo_clean).sortBy(lambda x: x[1],ascending=False).take(15):
    print x

(u'childcare', 8)
(u'rebekah', 7)
(u'economics', 5)
(u'believed', 4)
(u'cuba', 4)
(u'ben', 3)
(u'(applause)', 3)
(u'fears', 3)
(u'twenty-first', 3)
(u"ben's", 2)
(u'speech', 2)
(u'sights', 2)
(u'keeper', 2)
(u'misguided', 2)
(u'constant', 2)

Coding Exercise

What mood was the country in 1790-1796 vs 2009-2015 ?


1. word_count_gw_clean = 1790-1796-GW.txt

2. word_count_bo_2015_clean

In [27]:
for x in word_count_bo_clean.subtractByKey(word_count_gw_clean).sortBy(lambda x: x[1],ascending=False).take(15):
    print x

(u'jobs', 148)
(u'america', 131)
(u'americans', 105)
(u'tonight', 84)
(u'help', 81)
(u'businesses', 69)
(u'health', 55)
(u'back', 53)
(u'job', 51)
(u'reform', 51)
(u'deficit', 48)
(u'down', 47)
(u'college', 40)
(u'today', 40)
(u"can't", 39)

In [28]:
for x in word_count_gw_clean.subtractByKey(word_count_bo_clean).sortBy(lambda x: x[1],ascending=False).take(15):
    print x

(u'present', 28)
(u'measures', 26)
(u'representatives:', 24)
(u'provision', 24)
(u'indians', 23)
(u'militia', 20)
(u'gentlemen', 19)
(u'ought', 19)
(u'object', 17)
(u'however', 16)
(u'satisfaction', 15)
(u'establishment', 15)
(u'due', 15)
(u'objects', 15)
(u'tribes', 14)

Now it is easy to see Obama vs. FDR or WJC vs. AL ...

In [29]:
for x in word_count_fdr_clean.subtractByKey(word_count_bo_clean).sortBy(lambda x: x[1],ascending=False).take(15):
    print x

(u'present', 39)
(u'japanese', 36)
(u'therefore', 31)
(u'essential', 31)
(u'enemy', 24)
(u'1942', 22)
(u'methods', 21)
(u'adequate', 21)
(u'principles', 20)
(u'peoples', 20)
(u'1933', 19)
(u'however', 19)
(u'objectives', 19)
(u'agriculture', 18)
(u'civilization', 18)

In [30]:
for x in word_count_bo_clean.subtractByKey(word_count_fdr_clean).sortBy(lambda x: x[1],ascending=False).take(15):
    print x

(u'energy', 87)
(u'tonight', 84)
(u'businesses', 69)
(u'college', 40)
(u'schools', 36)
(u'students', 27)
(u'oil', 26)
(u'kids', 25)
(u'republicans', 25)
(u'innovation', 24)
(u'gas', 23)
(u'democrats', 23)
(u'research', 22)
(u'nuclear', 21)
(u'technology', 20)

In [31]:
for x in word_count_bo_clean.subtractByKey(word_count_wjc_clean).sortBy(lambda x: x[1],ascending=False).take(15):
    print x

(u'oil', 26)
(u'manufacturing', 19)
(u'afghanistan', 18)
(u'al', 15)
(u'afghan', 15)
(u'iran', 12)
(u'solar', 12)
(u'qaeda', 12)
(u'code', 11)
(u'rebuilding', 10)
(u'worse', 10)
(u'infrastructure', 10)
(u'biden', 9)
(u'breaks', 9)
(u'high-speed', 8)

In [32]:
for x in word_count_bo_clean.subtractByKey(word_count_gwb_clean).sortBy(lambda x: x[1],ascending=False).take(15):
    print x

(u'banks', 22)
(u'industry', 20)
(u'trillion', 17)
(u'middle-class', 13)
(u'wage', 12)
(u'class', 12)
(u'forge', 11)
(u'lay', 11)
(u'lending', 11)
(u'walk', 11)
(u'helps', 10)
(u'restore', 10)
(u'minimum', 10)
(u'high-tech', 9)
(u'biden', 9)

In [33]:
for x in word_count_bo_clean.subtractByKey(word_count_al_clean).sortBy(lambda x: x[1],ascending=False).take(15): #collect():
    print x

(u'jobs', 148)
(u'tonight', 84)
(u'businesses', 69)
(u'families', 56)
(u'job', 51)
(u'deficit', 48)
(u'college', 40)
(u'today', 40)
(u"can't", 39)
(u'schools', 36)
(u'million', 36)
(u'workers', 35)
(u'clean', 35)
(u'hard', 32)
(u'budget', 30)

In [34]:
for x in word_count_wjc_clean.subtractByKey(word_count_al_clean).sortBy(lambda x: x[1],ascending=False).take(15): #collect():
    print x

(u'tonight', 98)
(u'families', 82)
(u'budget', 77)
(u'challenge', 76)
(u'parents', 70)
(u'schools', 66)
(u'jobs', 66)
(u'child', 65)
(u'million', 64)
(u'today', 60)
(u'crime', 60)
(u'21st', 58)
(u'college', 54)
(u'thank', 47)
(u'bill', 47)

In [35]:
for x in word_count_gwb_clean.subtractByKey(word_count_al_clean).sortBy(lambda x: x[1],ascending=False).take(15): #collect():
    print x

(u'terrorists', 83)
(u'iraq', 80)
(u'tonight', 76)
(u'terror', 63)
(u'weapons', 56)
(u'iraqi', 48)
(u'jobs', 36)
(u'workers', 36)
(u'al', 36)
(u'terrorist', 36)
(u'afghanistan', 35)
(u'qaeda', 32)
(u'goal', 32)
(u'nuclear', 32)
(u'million', 31)

In [36]:
for x in word_count_al_clean.subtractByKey(word_count_bo_clean).sortBy(lambda x: x[1],ascending=False).take(15): #collect():
    print x

(u'persons', 30)
(u'naval', 27)
(u'present', 27)
(u'emancipation', 25)
(u'consideration', 22)
(u'receipts', 22)
(u'however', 21)
(u'measures', 20)
(u'slavery', 19)
(u'proclamation', 17)
(u'vessels', 17)
(u'indian', 17)
(u'believed', 16)
(u'powers', 16)
(u'actual', 16)

In [37]:
for x in word_count_al_clean.subtractByKey(word_count_wjc_clean).sortBy(lambda x: x[1],ascending=False).take(15): #collect():
    print x

(u'persons', 30)
(u'navy', 28)
(u'condition', 28)
(u'naval', 27)
(u'emancipation', 25)
(u'thus', 23)
(u'consideration', 22)
(u'receipts', 22)
(u'claims', 20)
(u'interior', 18)
(u'vessels', 17)
(u'proclamation', 17)
(u'powers', 16)
(u'believed', 16)
(u'actual', 16)

That is All Folks !

In [ ]: