In [1]:
import re
from operator import add
file_in = sc.textFile('/home/jjgarcia/clase/spark/quijote.txt')
print('number of lines in file: %s' % file_in.count())
In [2]:
chars = file_in.map(lambda s: len(s)).reduce(add)
print('number of characters in file: %s' % chars)
In [2]:
# Get words from the input file
words =file_in.flatMap(lambda line: re.split('\W+', line.lower().strip()))
# words of more than 3 characters
words = words.filter(lambda x: len(x) > 3)
# set count 1 per word
words = words.map(lambda w: (w,1))
# reduce phase - sum count all the words
words = words.reduceByKey(add)
# create tuple (count, word) and sort in descending
words = words.map(lambda x: (x[1], x[0])).sortByKey(False)
# take top 20 words by frequency
words.take(20)
Out[2]:
In [3]:
# create function for histogram of most frequent words
% matplotlib inline
import matplotlib.pyplot as plt
plt.rcdefaults()
def histogram(words):
count = map(lambda x: x[1], words)
word = map(lambda x: x[0], words)
plt.barh(range(len(count)), count,alpha=0.4)
plt.yticks(range(len(count)), word)
# Change order of tuple (word, count) from (count, word)
words = words.map(lambda x:(x[1], x[0]))
words.take(25)
# display histogram
#plt.figure(figsize=(12, 14))
histogram(words.take(25))
plt.show()
In [ ]:
In [ ]: