In [1]:
from operator import add
import os.path
import re
from pyspark import SparkContext

In [38]:
# set path and infile: the bill text and stopword list live under ~/polatt/data
home = os.path.expanduser('~')

# bill under review (H.R. 2, 114th Congress)
bill_path = os.path.join('polatt', 'data', 'hr_2.txt')
infile = os.path.join(home, bill_path)

# stopword list used to filter common words out of the counts
stop_path = os.path.join('polatt', 'data', 'stopwords.txt')
stopfile = os.path.join(home, stop_path)

In [6]:
# only need the below line if running as script
# i.e. > spark-submit count.py
# otherwise PySpark builds the context automatically on load
#sc = SparkContext("local", "Bill Count")

In [20]:
def wordCount(wordListRDD):
    """Count occurrences of each word.

    Takes an RDD of individual words and returns an RDD of
    (word, count) pairs.
    """
    # pair every word with a count of 1, then sum counts per key
    ones = wordListRDD.map(lambda word: (word, 1))
    return ones.reduceByKey(add)

In [57]:
def removePunctuation(text):
    r"""Remove punctuation from `text`, lowercase it, and trim whitespace.

    Fix: the original pattern r'[^\w+\s]' placed a literal '+' inside the
    negated character class, so plus signs were never removed.  r'[^\w\s]'
    strips every character that is neither a word character nor whitespace.
    """
    text = re.sub(r'[^\w\s]', '', text)
    return text.lower().strip()

In [58]:
# read in stopwords
stopwords = set(sc
                .textFile(stopfile)
                .collect())

# read in words
wordsRDD = (sc
            .textFile(infile, 8)
            .map(removePunctuation)
            .cache())

# check that we have some text
print '\n'.join(wordsRDD
                .zipWithIndex()
                .map(lambda (l, num): '{0}: {1}'.format(num, l))
                .take(15))


0: congressional bills 114th congress
1: from the us government printing office
2: hr 2 introduced in house ih
3: 
4: 114th congress
5: 1st session
6: h r 2
7: 
8: to amend title xviii of the social security act to repeal the medicare
9: sustainable growth rate and strengthen medicare access by improving
10: physician payments and making other improvements to reauthorize the
11: childrens health insurance program and for other purposes
12: 
13: 
14: _______________________________________________________________________

In [59]:
# get all words and count
wordsRDD = (wordsRDD
            .flatMap(lambda x: x.split(' '))
            .filter(lambda x: len(x) > 0 and x not in stopwords))
twords = 20
topWords = wordCount(wordsRDD).takeOrdered(twords, key=lambda x: -x[1])
print "\nWord count is: %i" % wordsRDD.count()
print "\nTops %i Words:" % twords
print '\n'.join(map(lambda (w, c): '{0} :: {1}'.format(w, c), topWords))


Word count is: 17537

Tops 20 Words:
year :: 344
secretary :: 338
services :: 228
1 :: 224
performance :: 196
fiscal :: 186
eligible :: 186
security :: 172
42 :: 166
payment :: 165
social :: 165
medicare :: 162
inserting :: 155
professional :: 154
period :: 146
mips :: 143
respect :: 140
health :: 131
amended :: 130
striking :: 128

In [60]:
def simpleTokenize(string, split_regex=r'\W+'):
    r"""Lowercase `string` and split it into non-empty tokens.

    Runs of characters matching `split_regex` (default: non-word
    characters) act as delimiters.  Returns [] for None, empty, or
    all-delimiter input.

    Fixes: the original read a global `split_regex` that is not defined
    anywhere in this notebook (NameError on a fresh kernel) -- it is now
    a parameter with a sensible default.  The original also returned
    [''] for the empty string and kept empty tokens produced by adjacent
    delimiters; empty tokens are now filtered out.
    """
    if not string:
        return []
    # collapse delimiters to single spaces, then split and drop empties
    cleaned = re.sub(split_regex, ' ', string)
    return [tok for tok in cleaned.lower().strip().split(' ') if tok]

In [61]:
def tokenize(string):
    """Tokenize `string` with simpleTokenize and drop stopwords.

    Relies on the module-level `stopwords` set loaded earlier.
    """
    return [tok for tok in simpleTokenize(string) if tok not in stopwords]

In [ ]: