notebook.community

Edit and run



In [1]:

    
corpus = sc.textFile("../data/pg1342.txt")
corpus.cache()
corpus.takeSample(True, 5)









    Out[1]:





[u'for a few weeks, and when accident separates them, so easily forgets',
 u'officer, sometimes another, had been her favourite, as their attentions',
 u"entirely deceived in Miss Bingley's regard for me. But, my dear sister,",
 u'',
 u'most humiliating picture! And to the pang of a friend disgracing herself']



In [2]:

    
word_split = corpus.flatMap(lambda line: line.split(" ")).filter(lambda element: len(element) > 0)
word_split.takeSample(True, 5)









    Out[2]:





[u'could', u'by', u'momentary', u'far', u'a']



In [3]:

    
mapped = word_split.map(lambda word: (word.lower(), 1))
mapped.takeSample(True, 5)









    Out[3]:





[(u'intimation', 1),
 (u'explaining', 1),
 (u'respect,', 1),
 (u'the', 1),
 (u'his', 1)]



In [4]:

    
reduced = mapped.reduceByKey(lambda a, b: a+b)
reduced.takeSample(True, 5)









    Out[4]:





[(u'proudest,', 1),
 (u'compliments', 9),
 (u'valuable.', 1),
 (u'slept', 1),
 (u"performer's", 1)]



In [5]:

    
swapped = reduced.map(lambda row: (row[1], row[0])).sortByKey(ascending=False)
swapped.take(10)









    Out[5]:





[(4479, u'the'),
 (4169, u'to'),
 (3680, u'of'),
 (3398, u'and'),
 (1982, u'a'),
 (1941, u'her'),
 (1890, u'in'),
 (1798, u'was'),
 (1740, u'i'),
 (1611, u'she')]



In [6]:

    
unswapped = swapped.map(lambda row: (row[1], row[0]))
unswapped.take(10)









    Out[6]:





[(u'the', 4479),
 (u'to', 4169),
 (u'of', 3680),
 (u'and', 3398),
 (u'a', 1982),
 (u'her', 1941),
 (u'in', 1890),
 (u'was', 1798),
 (u'i', 1740),
 (u'she', 1611)]



In [8]:

    
#concise implementation
from operator import add
swap = lambda row: (row[1], row[0])
words = sc.textFile("../data/pg1342.txt").flatMap(lambda line: line.split(" "))
words.cache()
counts = words.filter(lambda element: len(element) > 0).map(
  lambda word: (word, 1)).reduceByKey(add).map(swap).sortByKey(ascending=False).map(swap).collect()



In [15]:

    
counts[:10]









    Out[15]:





[(u'the', 4205),
 (u'to', 4121),
 (u'of', 3662),
 (u'and', 3309),
 (u'a', 1945),
 (u'her', 1858),
 (u'in', 1813),
 (u'was', 1796),
 (u'I', 1740),
 (u'that', 1419)]



In [ ]: