In [ ]:
# Read the whole play into memory as one string; the context manager closes
# the file as soon as the read is done.
with open('../data/hamlet.txt', 'r') as infile:
    hamlet = infile.read()
In [ ]:
import re
# Lower-case the text and split it on runs of non-word characters.
# The pattern is a raw string so the backslash reaches the regex engine
# untouched (a plain '\W' is an invalid string escape on newer Pythons).
# NOTE: if the text starts or ends with punctuation, re.split() yields an
# empty string at that end of the list.
words = re.split(r'\W+', hamlet.lower().strip())
# print(...) with one argument prints the same on Python 2 and Python 3.
print(words[:10])
In [ ]:
# Drop tokens of one or two characters (this also removes empty strings).
# A list comprehension always yields a real list, so the slice below works
# on Python 3 as well, where filter() returns a lazy iterator.
words = [w for w in words if len(w) > 2]
print(words[:10])
In [ ]:
# Running tally of word -> number of occurrences, filled by add_to_dict().
wc = dict()


def add_to_dict(word):
    """Record one occurrence of `word` in the module-level `wc` dict.

    A word that has not been seen before starts from 0, so its first
    occurrence is stored as 1.
    """
    wc[word] = wc.get(word, 0) + 1
# Tally every word. An explicit loop is used instead of map(): the call is
# made purely for its side effect on `wc`, and on Python 3 map() is lazy, so
# nothing would be counted there.
for word in words:
    add_to_dict(word)
# Sort the (word, count) pairs by count, highest first, and keep the top 15.
top_words = sorted(wc.items(), key=lambda x: x[1], reverse=True)[:15]
In [ ]:
# Show the (at most) 15 most frequent (word, count) pairs, highest count first.
top_words
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
def plot(words):
    """Draw a grey horizontal bar chart of word counts and show it.

    Args:
        words: sequence of (label, count) pairs. Pair number i becomes the
            bar at y position i, labelled with its word.
    """
    # Build real lists: len() is taken of them below, which fails on the
    # lazy object that map() returns on Python 3.
    labels = [pair[0] for pair in words]
    values = [pair[1] for pair in words]
    positions = range(len(values))
    plt.barh(positions, values, color='grey')
    plt.yticks(positions, labels)
    plt.show()
In [ ]:
# Bar chart of the most frequent words found by the pure-Python count.
plot(top_words)
In [ ]:
import findspark
import os
# findspark.init() must run before pyspark is imported, which is why the
# pyspark import sits below it rather than with the other imports.
# Presumably it makes the local Spark installation importable - TODO confirm.
findspark.init() # you need that before import pyspark.
import pyspark
# Create the SparkContext with master 'local[2]' and application name
# 'pyspark'; every RDD in the cells below is created through `sc`.
sc = pyspark.SparkContext('local[2]', 'pyspark')
In [ ]:
# Load the play as an RDD of text lines. This rebinds `words`, which held
# the pure-Python token list in the earlier cells.
words = sc.textFile('../data/hamlet.txt')
# Peek at the first five lines.
words.take(5)
In [ ]:
# Split every line into lower-cased tokens and flatten the per-line lists
# into one RDD of tokens. This rebinds `hamlet`, which held the raw text
# in the earlier cells. The pattern is a raw string so the backslash
# reaches the regex engine untouched.
hamlet = words.flatMap(lambda line: re.split(r'\W+', line.lower().strip()))
hamlet.take(5)
In [ ]:
# Keep only tokens longer than two characters (this also drops the empty
# strings that re.split() produces for blank lines).
tmp = hamlet.filter(lambda x: len(x) > 2)
# print(...) with one argument prints the same on Python 2 and Python 3.
print(tmp.take(5))
In [ ]:
# Turn every token into a (word, 1) pair so the ones can be summed per word.
tmp = tmp.map(lambda word: (word, 1))
tmp.take(5)
In [ ]:
# Add up the ones for each distinct word, giving (word, count) pairs.
tmp = tmp.reduceByKey(lambda a, b: a + b)
tmp.take(5)
In [ ]:
# Swap each pair to (count, word) so that the count is the key, then sort by
# key; the False argument asks for descending order.
tmp = tmp.map(lambda x: (x[1], x[0])).sortByKey(False)
tmp.take(20)
In [ ]:
# Swap the pairs back to (word, count), the shape that plot() expects.
tmp = tmp.map(lambda x: (x[1], x[0]))
tmp.take(20)
In [ ]:
# Bar chart of the first 15 (word, count) pairs of the Spark result.
plot(tmp.take(15))
In [ ]:
# Same chart for the pure-Python result, presumably to compare it with the
# Spark chart.
plot(top_words)
In [ ]:
# The whole word count as one chained Spark job: read the lines, tokenise,
# drop short tokens, count per word and sort by count in descending order.
# The result is left as (count, word) pairs. Parentheses replace the
# backslash continuations, and the regex is a raw string so the backslash
# reaches the regex engine untouched.
words = (
    sc.textFile('../data/hamlet.txt')
    .flatMap(lambda line: re.split(r'\W+', line.lower().strip()))
    .filter(lambda x: len(x) > 2)
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
    .map(lambda x: (x[1], x[0]))
    .sortByKey(False)
)
words.take(15)
In [ ]:
# From here on `hamlet` is the path of the text file and `words` is the RDD
# of its raw lines (both names are rebound).
hamlet = '../data/hamlet.txt'
words = sc.textFile(hamlet)
How many lines contain the word "hamlet" (ignoring case)?
In [ ]:
# Keep the lines that contain "hamlet" in any letter case. This is a
# substring match, so longer words containing it count as well.
tmp = words.filter(lambda x: "hamlet" in x.lower())
# Number of matching lines.
tmp.count()
What is that as a percentage of all lines?
In [ ]:
# Matching lines as a percentage of all lines, formatted with two decimals.
# float() forces true division; with two ints Python 2 would truncate.
'{0:0.2f}%'.format(100*tmp.count()/float(words.count()))
How many lines mention both "claudius" and "hamlet"?
In [ ]:
# `tmp` already holds only the lines containing "hamlet", so filtering it
# for "claudius" counts the lines that contain both names.
tmp.filter( lambda x: 'claudius' in x.lower()).count()
In [ ]:
# Fetch those lines themselves as a Python list.
tmp.filter( lambda x: 'claudius' in x.lower()).collect()
In [ ]: