In [1]:
from operator import add
import os.path
import re
from pyspark import SparkContext
In [38]:
# set paths to the bill text and the stopword list
home = os.path.expanduser('~')
bill_path = os.path.join('polatt', 'data', 'hr_2.txt')
stop_path = os.path.join('polatt', 'data', 'stopwords.txt')
infile = os.path.join(home, bill_path)
stopfile = os.path.join(home, stop_path)
In [6]:
# the line below is only needed when running as a script,
# i.e. > spark-submit count.py
# otherwise the PySpark shell/notebook builds the context automatically on load
#sc = SparkContext("local", "Bill Count")
In [20]:
def wordCount(wordListRDD):
    # pair each word with 1, then sum the counts per word
    return (wordListRDD
            .map(lambda x: (x, 1))
            .reduceByKey(add))
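In [ ]:
# A quick sanity check of wordCount on a tiny in-memory RDD; a hedged sketch,
# not part of the bill data, and the sample words are made up.
sampleWordsRDD = sc.parallelize(['cat', 'dog', 'cat', 'fish', 'dog', 'cat'])
print wordCount(sampleWordsRDD).collect()
# expected (order may vary): [('cat', 3), ('dog', 2), ('fish', 1)]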
In [57]:
def removePunctuation(text):
    # drop everything except word characters and whitespace, then lowercase
    text = re.sub(r'[^\w\s]', '', text)
    return text.lower().strip()
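In [ ]:
# Illustrative check of removePunctuation on an assumed sample string
# (not taken from the bill text).
print removePunctuation(' Be it enacted, by the Senate! ')
# expected: 'be it enacted by the senate'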
In [58]:
# read in stopwords
stopwords = set(sc
                .textFile(stopfile)
                .collect())
# read in words
wordsRDD = (sc
            .textFile(infile, 8)
            .map(removePunctuation)
            .cache())
# check that we have some text
print '\n'.join(wordsRDD
                .zipWithIndex()
                .map(lambda (l, num): '{0}: {1}'.format(num, l))
                .take(15))
In [59]:
# get all words and count
wordsRDD = (wordsRDD
            .flatMap(lambda x: x.split(' '))
            .filter(lambda x: len(x) > 0 and x not in stopwords))
twords = 20
topWords = wordCount(wordsRDD).takeOrdered(twords, key=lambda x: -x[1])
print "\nWord count is: %i" % wordsRDD.count()
print "\nTops %i Words:" % twords
print '\n'.join(map(lambda (w, c): '{0} :: {1}'.format(w, c), topWords))
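In [ ]:
# A hedged sketch, not part of the original run: takeOrdered sorts ascending by
# the key function, so negating the count returns the most frequent pairs first.
# The (word, count) pairs below are made up for illustration.
print (sc
       .parallelize([('bill', 4), ('act', 9), ('section', 7)])
       .takeOrdered(2, key=lambda x: -x[1]))
# expected: [('act', 9), ('section', 7)]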
In [60]:
# split_regex is not defined anywhere in this notebook, so a pattern matching
# runs of non-word characters is assumed here
split_regex = r'\W+'

def simpleTokenize(string):
    # guard against None and blank strings before splitting
    if string is None or string.strip() == '':
        return []
    string = re.sub(split_regex, ' ', string)
    return string.lower().strip().split(' ')
In [61]:
def tokenize(string):
    # split into tokens, then drop any word in the stopword set
    token_list = simpleTokenize(string)
    tokens = [x for x in token_list if x not in stopwords]
    return tokens
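In [ ]:
# End-to-end check of simpleTokenize and tokenize on an assumed sample line;
# the second result depends on the stopword list loaded above.
sample = 'An Act to amend the Internal Revenue Code of 1986'
print simpleTokenize(sample)
print tokenize(sample)
# tokenize should drop common stopwords such as 'an', 'to', 'the', 'of'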
In [ ]: