Word Count


In [ ]:
with open('../data/hamlet.txt','r') as infile:
    hamlet = infile.read()

Standard Python


In [ ]:
import re

words = re.split(r'\W+', hamlet.lower().strip())
print(words[:10])

In [ ]:
words = list(filter(lambda x: len(x) > 2, words))
print(words[:10])

In [ ]:
wc = dict()

def add_to_dict(word):
    wc[word] = wc.get(word, 0) + 1

# map the words into the counting dict (list() forces the lazy map to run)
list(map(add_to_dict, words))

# sort by frequency, descending, and keep the top 15
top_words = sorted(wc.items(), key=lambda x: x[1], reverse=True)[:15]

In [ ]:
top_words
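
As an aside, the standard library's collections.Counter does the same tallying and top-N selection in one step (a minimal sketch, assuming words is still the filtered list built above):


In [ ]:
from collections import Counter

# Counter tallies the words; most_common() sorts by frequency, descending
Counter(words).most_common(15)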

In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt

def plot(words):
    values = [x[1] for x in words]
    labels = [x[0] for x in words]
    plt.barh(range(len(values)), values, color='grey')
    plt.yticks(range(len(values)), labels)
    plt.show()

In [ ]:
plot(top_words)

Spark


In [ ]:
import findspark
findspark.init()  # must be called before importing pyspark

import pyspark
sc = pyspark.SparkContext('local[2]', 'pyspark')  # two local worker threads, app name 'pyspark'

In [ ]:
words = sc.textFile('../data/hamlet.txt')
words.take(5)

In [ ]:
hamlet = words.flatMap(lambda line: re.split(r'\W+', line.lower().strip()))
hamlet.take(5)

In [ ]:
tmp = hamlet.filter(lambda x: len(x) > 2)
print(tmp.take(5))

In [ ]:
tmp = tmp.map(lambda word: (word, 1))
tmp.take(5)

In [ ]:
tmp = tmp.reduceByKey(lambda a, b: a + b)
tmp.take(5)
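
As an aside, when the vocabulary fits comfortably on the driver, countByValue() on the filtered word RDD yields the same tallies without building (word, 1) pairs (a sketch; counts is just an illustrative name):


In [ ]:
# countByValue() returns a dict-like {word: count} on the driver
counts = hamlet.filter(lambda x: len(x) > 2).countByValue()
sorted(counts.items(), key=lambda x: x[1], reverse=True)[:5]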

In [ ]:
# swap to (count, word) so we can sort by key, descending
tmp = tmp.map(lambda x: (x[1], x[0])).sortByKey(False)
tmp.take(20)

In [ ]:
# swap back to (word, count) for plotting
tmp = tmp.map(lambda x: (x[1], x[0]))
tmp.take(20)
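
The swap, sort, swap-back dance can also be written with sortBy(), which sorts on an arbitrary key function and leaves the (word, count) pairs untouched (a sketch; the result should match the cell above):


In [ ]:
# sortBy sorts on the count field directly, descending
tmp.sortBy(lambda x: x[1], ascending=False).take(15)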

In [ ]:
plot(tmp.take(15))

In [ ]:
plot(top_words)

Summary code


In [ ]:
words = sc.textFile('../data/hamlet.txt')\
        .flatMap(lambda line: re.split(r'\W+', line.lower().strip()))\
        .filter(lambda x: len(x) > 2)\
        .map(lambda word: (word, 1))\
        .reduceByKey(lambda a, b: a + b)\
        .map(lambda x: (x[1], x[0])).sortByKey(False)

words.take(15)
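
takeOrdered() can replace the final swap and sortByKey() pair, pulling the top 15 (word, count) pairs straight off the reduced RDD (a sketch; the chain up to reduceByKey is unchanged):


In [ ]:
sc.textFile('../data/hamlet.txt')\
        .flatMap(lambda line: re.split(r'\W+', line.lower().strip()))\
        .filter(lambda x: len(x) > 2)\
        .map(lambda word: (word, 1))\
        .reduceByKey(lambda a, b: a + b)\
        .takeOrdered(15, key=lambda x: -x[1])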

Text filtering


In [ ]:
hamlet = '../data/hamlet.txt'
words = sc.textFile(hamlet)

How many lines contain 'hamlet'?


In [ ]:
tmp = words.filter(lambda x: "hamlet" in x.lower())
tmp.count()

As a percentage of all lines?


In [ ]:
'{0:0.2f}%'.format(100*tmp.count()/float(words.count()))

How many lines mention both 'claudius' and 'hamlet'?


In [ ]:
tmp.filter( lambda x: 'claudius' in x.lower()).count()

In [ ]:
tmp.filter( lambda x: 'claudius' in x.lower()).collect()
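
The two filters can also be folded into a single predicate; which way is clearer is a matter of taste (a sketch, equivalent to chaining the filters above):


In [ ]:
words.filter(lambda x: 'hamlet' in x.lower() and 'claudius' in x.lower()).count()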

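When you are finished, the SparkContext can be stopped to release the local worker threads:


In [ ]:
sc.stop()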