In [ ]:
#! /usr/bin/env python

import string
from email.utils import parseaddr

for_split = [',', '\n', '\t', '\'', '.', '\"', '!', '?', '-', '~', '[', ']', 
        '=', '(', ')', '\"', ':', ';', '{', '}', '<', '>']
ignored = ['Re:', 'the', 'and', 'i', 'to', 'of', 'a', 'in', 'was', 'that', 'had',
        'he', 'you', 'his','my', 'it', 'as', 'with', 'her', 'for', 'on']

@outputSchema("y:bag{t:tuple(word:chararray, wordcount:int)}")
def getTop5Words(bag):
    result = []
    i = 0
    wordcount = {}
    for record in bag:
        doc = record[0]
        for ch in for_split:
            doc = string.replace(doc, ch, ' ')
        for word in [w.lower() for w in string.split(doc)]:
            if word not in wordcount:
                wordcount[word] = 1
            else:
                wordcount[word] += 1
    for word in sorted(wordcount, key=wordcount.get, reverse=True):
        if not word in ignored and i < 5 and wordcount[word] > 1:
            tup = (word, wordcount[word])
            result.append(tup)
            i = i + 1
    return result

@outputSchema("fromEmail:chararray")
def getFromEmail(fromEmail):
    return parseaddr(fromEmail)[1]