Word Count


In [ ]:
with open('../data/hamlet.txt','r') as infile:
    hamlet = infile.read()

Standard Python


In [ ]:
import re

words = re.split(r'\W+', hamlet.lower().strip())
print(words[:10])

In [ ]:
words = list(filter(lambda x: len(x) > 2, words))
print(words[:10])

In [ ]:
wc = dict()

def add_to_dict(word):
    wc[word] = wc.get(word, 0) + 1

# map the words into the counting dict (list() forces the lazy map to run)
list(map(add_to_dict, words))

# sort by frequency, descending, and keep the top 15
top_words = sorted(wc.items(), key=lambda x: x[1], reverse=True)[:15]

In [ ]:
top_words
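
As an aside, the standard library's collections.Counter does the same tallying and top-N selection in one step (a minimal sketch, assuming words is still the filtered list built above):


In [ ]:
from collections import Counter

# Counter tallies the words; most_common() sorts by frequency, descending
Counter(words).most_common(15)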

In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt

def plot(words):
    values = [x[1] for x in words]
    labels = [x[0] for x in words]
    plt.barh(range(len(values)), values, color='grey')
    plt.yticks(range(len(values)), labels)
    plt.show()

In [ ]:
plot(top_words)

Spark


In [ ]:
import findspark
findspark.init()  # must be called before importing pyspark

import pyspark
sc = pyspark.SparkContext('local[2]', 'pyspark')  # two local worker threads, app name 'pyspark'

In [ ]:
words = sc.textFile('../data/hamlet.txt')
words.take(5)

In [ ]:
hamlet = words.flatMap(lambda line: re.split(r'\W+', line.lower().strip()))
hamlet.take(5)

In [ ]:
tmp = hamlet.filter(lambda x: len(x) > 2)
print(tmp.take(5))

In [ ]:
tmp = tmp.map(lambda word: (word, 1))
tmp.take(5)

In [ ]:
tmp = tmp.reduceByKey(lambda a, b: a + b)
tmp.take(5)
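
As an aside, when the vocabulary fits comfortably on the driver, countByValue() on the filtered word RDD yields the same tallies without building (word, 1) pairs (a sketch; counts is just an illustrative name):


In [ ]:
# countByValue() returns a dict-like {word: count} on the driver
counts = hamlet.filter(lambda x: len(x) > 2).countByValue()
sorted(counts.items(), key=lambda x: x[1], reverse=True)[:5]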

In [ ]:
# swap to (count, word) so we can sort by key, descending
tmp = tmp.map(lambda x: (x[1], x[0])).sortByKey(False)
tmp.take(20)

In [ ]:
# swap back to (word, count) for plotting
tmp = tmp.map(lambda x: (x[1], x[0]))
tmp.take(20)
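
The swap, sort, swap-back dance can also be written with sortBy(), which sorts on an arbitrary key function and leaves the (word, count) pairs untouched (a sketch; the result should match the cell above):


In [ ]:
# sortBy sorts on the count field directly, descending
tmp.sortBy(lambda x: x[1], ascending=False).take(15)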

In [ ]:
plot(tmp.take(15))

In [ ]:
plot(top_words)

Summary code


In [ ]:
words = sc.textFile('../data/hamlet.txt')\
        .flatMap(lambda line: re.split(r'\W+', line.lower().strip()))\
        .filter(lambda x: len(x) > 2)\
        .map(lambda word: (word, 1))\
        .reduceByKey(lambda a, b: a + b)\
        .map(lambda x: (x[1], x[0])).sortByKey(False)

words.take(15)
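
takeOrdered() can replace the final swap and sortByKey() pair, pulling the top 15 (word, count) pairs straight off the reduced RDD (a sketch; the chain up to reduceByKey is unchanged):


In [ ]:
sc.textFile('../data/hamlet.txt')\
        .flatMap(lambda line: re.split(r'\W+', line.lower().strip()))\
        .filter(lambda x: len(x) > 2)\
        .map(lambda word: (word, 1))\
        .reduceByKey(lambda a, b: a + b)\
        .takeOrdered(15, key=lambda x: -x[1])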

Text filtering


In [ ]:
hamlet = '../data/hamlet.txt'
words = sc.textFile(hamlet)

How many lines contain 'hamlet'?


In [ ]:
tmp = words.filter(lambda x: "hamlet" in x.lower())
tmp.count()

As a percentage of all lines?


In [ ]:
'{0:0.2f}%'.format(100*tmp.count()/float(words.count()))

How many lines mention both 'claudius' and 'hamlet'?


In [ ]:
tmp.filter( lambda x: 'claudius' in x.lower()).count()

In [ ]:
tmp.filter( lambda x: 'claudius' in x.lower()).collect()
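
The two filters can also be folded into a single predicate; which way is clearer is a matter of taste (a sketch, equivalent to chaining the filters above):


In [ ]:
words.filter(lambda x: 'hamlet' in x.lower() and 'claudius' in x.lower()).count()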

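When you are finished, the SparkContext can be stopped to release the local worker threads:


In [ ]:
sc.stop()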