In [1]:
from pyspark.context import SparkContext
# Report the Spark version of the active context. `sc` is not created in this
# notebook -- presumably it is injected by the PySpark kernel/shell (confirm).
# A parenthesized single-argument print behaves the same on Python 2 and 3.
print("Running Spark Version %s" % sc.version)
In [2]:
from pyspark.conf import SparkConf
# Build a SparkConf with default loading and print its debug dump.
conf = SparkConf()
print(conf.toDebugString())
In [3]:
# imports
from operator import add
In [4]:
# Per-word occurrence counts for the text in sotu/2009-2014-BO.txt.
lines = sc.textFile("sotu/2009-2014-BO.txt")
word_count_bo = (
    lines.flatMap(lambda line: line.split(' '))
    # Normalize each token: lowercase, trim surrounding whitespace, drop
    # trailing ',' and then trailing '.', and map the typographic apostrophe
    # (U+2019) to "'". The apostrophe step matches the 2009-2015 corpus so that
    # subtractByKey comparisons and the straight-apostrophe entries in
    # common_words see the same keys.
    .map(lambda token: (token.lower().strip().rstrip(',').rstrip('.')
                        .replace(u"\u2019", "'"), 1))
    .reduceByKey(add)
)
# Number of distinct normalized tokens.
word_count_bo.count()
# Counts noted while tuning (before apostrophe normalization was applied here):
# 6658 without lower, 6299 with lower, rstrip,lstrip 4835
Out[4]:
In [5]:
# Per-word occurrence counts for the text in sotu/2009-2015-BO.txt.
lines = sc.textFile("sotu/2009-2015-BO.txt")
word_count_bo_2015 = (
    lines.flatMap(lambda line: line.split(' '))
    # Token normalization: lowercase, trim whitespace, drop trailing ',' then
    # trailing '.', and replace the typographic apostrophe with "'".
    .map(lambda token: (token.lower().strip().rstrip(',').rstrip('.')
                        .replace(u"\u2019", "'"), 1))
    .reduceByKey(add)
)
# Number of distinct normalized tokens.
word_count_bo_2015.count()
Out[5]:
In [6]:
#output = word_count_bo.collect()
#for (word, count) in output:
# print "%s: %i" % (word, count)
In [7]:
# Per-word occurrence counts for the text in sotu/2001-2008-GWB.txt.
lines = sc.textFile("sotu/2001-2008-GWB.txt")
word_count_gwb = (
    lines.flatMap(lambda line: line.split(' '))
    # Lowercase, trim whitespace, drop trailing ',' then '.', and map the
    # typographic apostrophe (U+2019) to "'" -- the same normalization used for
    # the 2009-2015 corpus, so cross-corpus key comparisons are consistent.
    .map(lambda token: (token.lower().strip().rstrip(',').rstrip('.')
                        .replace(u"\u2019", "'"), 1))
    .reduceByKey(add)
)
# Number of distinct normalized tokens.
word_count_gwb.count()
Out[7]:
In [8]:
# Per-word occurrence counts for the text in sotu/1994-2000-WJC.txt.
lines = sc.textFile("sotu/1994-2000-WJC.txt")
word_count_wjc = (
    lines.flatMap(lambda line: line.split(' '))
    # Lowercase, trim whitespace, drop trailing ',' then '.', and map the
    # typographic apostrophe (U+2019) to "'" -- the same normalization used for
    # the 2009-2015 corpus, so cross-corpus key comparisons are consistent.
    .map(lambda token: (token.lower().strip().rstrip(',').rstrip('.')
                        .replace(u"\u2019", "'"), 1))
    .reduceByKey(add)
)
# Number of distinct normalized tokens.
word_count_wjc.count()
Out[8]:
In [9]:
# Per-word occurrence counts for the text in sotu/1961-1963-JFK.txt.
lines = sc.textFile("sotu/1961-1963-JFK.txt")
word_count_jfk = (
    lines.flatMap(lambda line: line.split(' '))
    # Lowercase, trim whitespace, drop trailing ',' then '.', and map the
    # typographic apostrophe (U+2019) to "'" -- the same normalization used for
    # the 2009-2015 corpus, so cross-corpus key comparisons are consistent.
    .map(lambda token: (token.lower().strip().rstrip(',').rstrip('.')
                        .replace(u"\u2019", "'"), 1))
    .reduceByKey(add)
)
# Number of distinct normalized tokens.
word_count_jfk.count()
Out[9]:
In [10]:
# Per-word occurrence counts for the text in sotu/1934-1945-FDR.txt.
lines = sc.textFile("sotu/1934-1945-FDR.txt")
word_count_fdr = (
    lines.flatMap(lambda line: line.split(' '))
    # Lowercase, trim whitespace, drop trailing ',' then '.', and map the
    # typographic apostrophe (U+2019) to "'" -- the same normalization used for
    # the 2009-2015 corpus, so cross-corpus key comparisons are consistent.
    .map(lambda token: (token.lower().strip().rstrip(',').rstrip('.')
                        .replace(u"\u2019", "'"), 1))
    .reduceByKey(add)
)
# Number of distinct normalized tokens.
word_count_fdr.count()
Out[10]:
In [11]:
# Per-word occurrence counts for the text in sotu/1861-1864-AL.txt.
lines = sc.textFile("sotu/1861-1864-AL.txt")
word_count_al = (
    lines.flatMap(lambda line: line.split(' '))
    # Lowercase, trim whitespace, drop trailing ',' then '.', and map the
    # typographic apostrophe (U+2019) to "'" -- the same normalization used for
    # the 2009-2015 corpus, so cross-corpus key comparisons are consistent.
    .map(lambda token: (token.lower().strip().rstrip(',').rstrip('.')
                        .replace(u"\u2019", "'"), 1))
    .reduceByKey(add)
)
# Number of distinct normalized tokens.
word_count_al.count()
Out[11]:
In [12]:
# Per-word occurrence counts for the text in sotu/1790-1796-GW.txt.
lines = sc.textFile("sotu/1790-1796-GW.txt")
word_count_gw = (
    lines.flatMap(lambda line: line.split(' '))
    # Lowercase, trim whitespace, drop trailing ',' then '.', and map the
    # typographic apostrophe (U+2019) to "'" -- the same normalization used for
    # the 2009-2015 corpus, so cross-corpus key comparisons are consistent.
    .map(lambda token: (token.lower().strip().rstrip(',').rstrip('.')
                        .replace(u"\u2019", "'"), 1))
    .reduceByKey(add)
)
# Number of distinct normalized tokens.
word_count_gw.count()
Out[12]:
In [13]:
# Tokens excluded from the per-corpus word counts: the empty string, frequent
# function words, contractions, and transcript/markup residue such as
# "(applause.)" and "<!--".
# Stored as a frozenset because it is only used for `in` membership tests
# inside RDD filters (evaluated once per record): hashing avoids a linear scan
# of a list, and the duplicate "their"/"his" entries of the list form collapse.
common_words = frozenset([
    "", "us", "has", "all", "they", "from", "who", "what", "on", "by", "more", "as", "not", "their", "can",
    "new", "it", "but", "be", "are", "--", "i", "have", "this", "will", "for", "with", "is", "that", "in",
    "our", "we", "a", "of", "to", "and", "the", "that's", "or", "make", "do", "you", "at", "it's", "than",
    "if", "know", "last", "about", "no", "just", "now", "an", "because", "<p>we", "why", "we'll", "how",
    "two", "also", "every", "come", "we've", "year", "over", "get", "take", "one", "them", "we're", "need",
    "want", "when", "like", "most", "-", "been", "first", "where", "so", "these", "they're", "good", "would",
    "there", "should", "-->", "<!--", "up", "i'm", "his", "which", "may", "were", "such", "some",
    "those", "was", "here", "she", "he", "its", "her", "don't", "i've", "what's", "didn't",
    "shouldn't", "(applause.)", "let's", "doesn't", "(laughter.)",
])
In [14]:
# Order the 2009-2014 (word, count) pairs by count, largest first.
word_count_bo_1 = word_count_bo.sortBy(lambda kv: kv[1], ascending=False)
In [15]:
# Show the first 10 pairs of the count-sorted RDD (before stop-word filtering).
for entry in word_count_bo_1.take(10):
    print(entry)
In [16]:
# Keep only pairs whose word (element 0) is not listed in common_words.
word_count_bo_clean = word_count_bo_1.filter(lambda kv: kv[0] not in common_words)
In [17]:
# Number of (word, count) pairs remaining after the common_words filter;
# as the last expression of the cell it is displayed as the cell output.
word_count_bo_clean.count()
Out[17]:
In [18]:
# Show the first 20 pairs of the filtered RDD (derived from the
# count-descending word_count_bo_1).
for entry in word_count_bo_clean.take(20):
    print(entry)
In [19]:
# Keep only 2009-2015 pairs whose word (element 0) is not listed in common_words.
word_count_bo_2015_clean = word_count_bo_2015.filter(lambda kv: kv[0] not in common_words)
In [20]:
# Remove common_words entries from the 2001-2008 GWB counts.
word_count_gwb_clean = word_count_gwb.filter(lambda kv: kv[0] not in common_words)
# Print the count explicitly: a bare expression in the middle of a cell is
# evaluated (running a Spark job) but never displayed.
print(word_count_gwb_clean.count())
# 15 most frequent remaining words, highest count first.
for entry in word_count_gwb_clean.sortBy(lambda kv: kv[1], ascending=False).take(15):
    print(entry)
In [21]:
# Remove common_words entries from the 1994-2000 WJC counts.
word_count_wjc_clean = word_count_wjc.filter(lambda kv: kv[0] not in common_words)
# Print the count explicitly: a bare expression in the middle of a cell is
# evaluated (running a Spark job) but never displayed.
print(word_count_wjc_clean.count())
# 15 most frequent remaining words, highest count first.
for entry in word_count_wjc_clean.sortBy(lambda kv: kv[1], ascending=False).take(15):
    print(entry)
In [22]:
# Remove common_words entries from the 1961-1963 JFK counts.
# Filters word_count_jfk; this previously filtered word_count_wjc (copy-paste
# slip), which made the "JFK" results a duplicate of the WJC ones.
word_count_jfk_clean = word_count_jfk.filter(lambda kv: kv[0] not in common_words)
# Print the count explicitly: a bare expression in the middle of a cell is
# evaluated (running a Spark job) but never displayed.
print(word_count_jfk_clean.count())
# 15 most frequent remaining words, highest count first.
for entry in word_count_jfk_clean.sortBy(lambda kv: kv[1], ascending=False).take(15):
    print(entry)
In [23]:
# Remove common_words entries from the 1934-1945 FDR counts.
word_count_fdr_clean = word_count_fdr.filter(lambda kv: kv[0] not in common_words)
# Print the count explicitly: a bare expression in the middle of a cell is
# evaluated (running a Spark job) but never displayed.
print(word_count_fdr_clean.count())
# 15 most frequent remaining words, highest count first.
for entry in word_count_fdr_clean.sortBy(lambda kv: kv[1], ascending=False).take(15):
    print(entry)
In [24]:
# Remove common_words entries from the 1861-1864 AL counts.
word_count_al_clean = word_count_al.filter(lambda kv: kv[0] not in common_words)
# Print the count explicitly: a bare expression in the middle of a cell is
# evaluated (running a Spark job) but never displayed.
print(word_count_al_clean.count())
# 15 most frequent remaining words, highest count first.
for entry in word_count_al_clean.sortBy(lambda kv: kv[1], ascending=False).take(15):
    print(entry)
In [25]:
# Remove common_words entries from the 1790-1796 GW counts.
word_count_gw_clean = word_count_gw.filter(lambda kv: kv[0] not in common_words)
# Print the count explicitly: a bare expression in the middle of a cell is
# evaluated (running a Spark job) but never displayed.
print(word_count_gw_clean.count())
# 15 most frequent remaining words, highest count first.
for entry in word_count_gw_clean.sortBy(lambda kv: kv[1], ascending=False).take(15):
    print(entry)
In [26]:
# Words counted in the 2009-2015 BO corpus whose key does not occur in the
# 2009-2014 BO corpus; print the 15 with the highest counts.
for entry in (word_count_bo_2015_clean
              .subtractByKey(word_count_bo_clean)
              .sortBy(lambda kv: kv[1], ascending=False)
              .take(15)):
    print(entry)
In [27]:
# Words in the BO (2009-2014) counts whose key is absent from the GW counts;
# print the 15 with the highest counts.
for entry in (word_count_bo_clean
              .subtractByKey(word_count_gw_clean)
              .sortBy(lambda kv: kv[1], ascending=False)
              .take(15)):
    print(entry)
In [28]:
# Words in the GW counts whose key is absent from the BO (2009-2014) counts;
# print the 15 with the highest counts.
for entry in (word_count_gw_clean
              .subtractByKey(word_count_bo_clean)
              .sortBy(lambda kv: kv[1], ascending=False)
              .take(15)):
    print(entry)
In [29]:
# Words in the FDR counts whose key is absent from the BO (2009-2014) counts;
# print the 15 with the highest counts.
for entry in (word_count_fdr_clean
              .subtractByKey(word_count_bo_clean)
              .sortBy(lambda kv: kv[1], ascending=False)
              .take(15)):
    print(entry)
In [30]:
# Words in the BO (2009-2014) counts whose key is absent from the FDR counts;
# print the 15 with the highest counts.
for entry in (word_count_bo_clean
              .subtractByKey(word_count_fdr_clean)
              .sortBy(lambda kv: kv[1], ascending=False)
              .take(15)):
    print(entry)
In [31]:
# Words in the BO (2009-2014) counts whose key is absent from the WJC counts;
# print the 15 with the highest counts.
for entry in (word_count_bo_clean
              .subtractByKey(word_count_wjc_clean)
              .sortBy(lambda kv: kv[1], ascending=False)
              .take(15)):
    print(entry)
In [32]:
# Words in the BO (2009-2014) counts whose key is absent from the GWB counts;
# print the 15 with the highest counts.
for entry in (word_count_bo_clean
              .subtractByKey(word_count_gwb_clean)
              .sortBy(lambda kv: kv[1], ascending=False)
              .take(15)):
    print(entry)
In [33]:
# Words in the BO (2009-2014) counts whose key is absent from the AL counts;
# print the 15 with the highest counts.
# take(15) bounds the output; collect() was noted as the unbounded alternative.
for entry in (word_count_bo_clean
              .subtractByKey(word_count_al_clean)
              .sortBy(lambda kv: kv[1], ascending=False)
              .take(15)):
    print(entry)
In [34]:
# Words in the WJC counts whose key is absent from the AL counts;
# print the 15 with the highest counts.
# take(15) bounds the output; collect() was noted as the unbounded alternative.
for entry in (word_count_wjc_clean
              .subtractByKey(word_count_al_clean)
              .sortBy(lambda kv: kv[1], ascending=False)
              .take(15)):
    print(entry)
In [35]:
# Words in the GWB counts whose key is absent from the AL counts;
# print the 15 with the highest counts.
# take(15) bounds the output; collect() was noted as the unbounded alternative.
for entry in (word_count_gwb_clean
              .subtractByKey(word_count_al_clean)
              .sortBy(lambda kv: kv[1], ascending=False)
              .take(15)):
    print(entry)
In [36]:
# Words in the AL counts whose key is absent from the BO (2009-2014) counts;
# print the 15 with the highest counts.
# take(15) bounds the output; collect() was noted as the unbounded alternative.
for entry in (word_count_al_clean
              .subtractByKey(word_count_bo_clean)
              .sortBy(lambda kv: kv[1], ascending=False)
              .take(15)):
    print(entry)
In [37]:
# Words in the AL counts whose key is absent from the WJC counts;
# print the 15 with the highest counts.
# take(15) bounds the output; collect() was noted as the unbounded alternative.
for entry in (word_count_al_clean
              .subtractByKey(word_count_wjc_clean)
              .sortBy(lambda kv: kv[1], ascending=False)
              .take(15)):
    print(entry)
In [ ]: