In [1]:
%%writefile 5gram_test.txt
A BILL FOR ESTABLISHING RELIGIOUS	59	59	54
A Biography of General George	92	90	74
A Case Study in Government	102	102	78
A Case Study of Female	447	447	327
A Case Study of Limited	55	55	43
A Child's Christmas in Wales	1099	1061	866
A Circumstantial Narrative of the	62	62	50
A City by the Sea	62	60	49
A Collection of Fairy Tales	123	117	80
A Collection of Forms of	116	103	82


Overwriting 5gram_test.txt

Code to ingest data for large corpus


In [2]:
%%writefile GetIndexandOtherWords.py

import heapq
from re import findall
from mrjob.job import MRJob
from mrjob.step import MRStep


class TopList(list):
    def __init__(self, max_size, num_position=0):
        """
        Just like a list, except the append method adds the new value to the 
        list only if it is larger than the smallest value (or if the size of 
        the list is less than max_size). 
        
        If each element of the list is an int or float, uses that value for 
        comparison. If the elements in the list are lists or tuples, uses the 
        list_position element of the list or tuple for the comparison.
        """
        self.max_size = max_size
        self.pos = num_position
        
    def _get_key(self, x):
        return x[self.pos] if isinstance(x, (list, tuple)) else x
        
    def append(self, val):
        if len(self) < self.max_size:
            heapq.heappush(self, val)
        elif self._get_key(self[0]) < self._get_key(val):
            heapq.heapreplace(self, val)
            
    def final_sort(self):
        return sorted(self, key=self._get_key, reverse=True)

    
class GetIndexandOtherWords(MRJob):
    """
    Usage: python GetIndexandOtherWords.py --index-range 9000-10000 --top-n-words 10000 --use-term-counts True
    
    Given n-gram formatted data, outputs a file of the form:
    
    index    term
    index    term
    ...
    word     term
    word     term
    ...
    
    Where there would be 1001 index words and 10000 total words. Each word would be ranked based
    on either the term count listed in the Google n-gram data (i.e. the counts found in the
    underlying books) or the ranks would be based on the word count of the n-grams in the actual
    dataset (i.e. ignore the numbers/counts associated with each n-gram and count each n-gram
    exactly once).
    """
    def configure_options(self):
        super(GetIndexandOtherWords, self).configure_options()
        
        self.add_passthrough_option(
            '--index-range', 
            default="9-10", 
            help="Specify the range of the index words. ex. 9-10 means the ninth and " +
                 "tenth most popular words will serve as the index")
        
        self.add_passthrough_option(
            '--top-n-words', 
            default="10", 
            help="Specify the number of words to output in all")
        
        self.add_passthrough_option(
            '--use-term-counts', 
            default="True", 
            choices=["True","False"],
            help="When calculating the most frequent words, choose whether to count " + 
            "each word based on the term counts reported by Google or just based on " + 
            "the number of times the word appears in an n-gram")
        
        self.add_passthrough_option(
            '--return-counts', 
            default="False", 
            choices=["True","False"],
            help="The final output includes the counts of each word")
        
    def mapper_init(self):
        # Ensure command line options are sane
        top_n_words = int(self.options.top_n_words)
        last_index_word = int(self.options.index_range.split("-")[1])
        if top_n_words < last_index_word:
            raise ValueError("""--top-n-words value (currently %d) must be equal to or greater than
                             --index-range value (currently %d).""" % (top_n_words, last_index_word))
        
        self.stop_words =  {'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 
                            'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 
                            'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 
                            'its', 'itself', 'they', 'them', 'their', 'theirs', 
                            'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 
                            'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 
                            'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 
                            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 
                            'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 
                            'with', 'about', 'against', 'between', 'into', 'through', 
                            'during', 'before', 'after', 'above', 'below', 'to', 'from', 
                            'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 
                            'again', 'further', 'then', 'once', 'here', 'there', 'when', 
                            'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 
                            'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 
                            'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 
                            'can', 'will', 'just', 'don', 'should', 'now'}
        
    def mapper(self, _, lines):
        terms, term_count, page_count, book_count = lines.split("\t")
        
        # Either use the ngram term count for the count or count each word just once
        if self.options.use_term_counts == "True":
            term_count = int(term_count)
        else:
            term_count = 1
        
        # Iterate through each term. Skip stop words
        for term in findall(r'[a-z]+', terms.lower()):
            if term in self.stop_words:
                pass
            else:
                yield (term, term_count)
        
    def combiner(self, term, counts):
        yield (term, sum(counts))      
        
        
    def reducer_init(self):
        """
        Accumulates the top X words and yields them. Note: should only use if
        you want to emit a reasonable amount of top words (i.e. an amount that 
        could fit on a single computer.)
        """
        self.top_n_words = int(self.options.top_n_words)
        self.TopTerms = TopList(self.top_n_words, num_position=1)
        
    def reducer(self, term, counts):
        self.TopTerms.append((term, sum(counts)))

    def reducer_final(self):
        for pair in self.TopTerms:
            yield pair
        
    def mapper_single_key(self, term, count):
        """
        Send all the data to a single reducer
        """
        yield (1, (term, count))
        
    def reducer_init_top_vals(self):
        # Collect top words
        self.top_n_words = int(self.options.top_n_words)
        self.TopTerms = TopList(self.top_n_words, num_position=1)
        # Collect index words
        self.index_range = [int(num) for num in self.options.index_range.split("-")]
        self.index_low, self.index_high = self.index_range
        # Control if output shows counts or just words
        self.return_counts = self.options.return_counts == "True"

    def reducer_top_vals(self, _, terms):
        for term in terms:
            self.TopTerms.append(term)
            
    def reducer_final_top_vals(self):
        TopTerms = self.TopTerms.final_sort()
        
        if self.return_counts:            
            # Yield index words
            for term in TopTerms[self.index_low-1:self.index_high]:
                yield ("index", term)

            # Yield all words
            for term in TopTerms:
                yield ("words", term)
        else:
            # Yield index words
            for term in TopTerms[self.index_low-1:self.index_high]:
                yield ("index", term[0])

            # Yield all words
            for term in TopTerms:
                yield ("words", term[0])
    
    def steps(self):
        """
        Step one: Yield top n-words from each reducer. Means dataset size is 
                  n-words * num_reducers. Guarantees overall top n-words are 
                  sent to the next step.
        
        """
        mr_steps = [MRStep(mapper_init=self.mapper_init,
                           mapper=self.mapper,
                           combiner=self.combiner,
                           reducer_init=self.reducer_init,
                           reducer_final=self.reducer_final,
                           reducer=self.reducer),
                    MRStep(mapper=self.mapper_single_key,
                           reducer_init=self.reducer_init_top_vals,
                           reducer=self.reducer_top_vals,
                           reducer_final=self.reducer_final_top_vals)]
        return mr_steps
        
if __name__ == "__main__":
    GetIndexandOtherWords.run()


Overwriting GetIndexandOtherWords.py

Examples


In [3]:
# Very close to what we will run on the full dataset. Returns whether the word is an index 
# term or just a normal word.
!cat 5gram_test.txt | python GetIndexandOtherWords.py --index-range 9-10     \
                                                      --top-n-words 10       \
                                                       -q


"index"	"circumstantial"
"index"	"establishing"
"words"	"child"
"words"	"christmas"
"words"	"wales"
"words"	"female"
"words"	"collection"
"words"	"fairy"
"words"	"government"
"words"	"city"
"words"	"circumstantial"
"words"	"establishing"

In [9]:
%%time
# Very close to what we will run on the full dataset. Returns whether the word is an index 
# term or just a normal word.
!cat google_ngram.txt | python GetIndexandOtherWords.py --index-range 9-10     \
                                                        --top-n-words 10       \
                                                        --return-counts True   \
                                                         -q


"index"	["country", 124759]
"index"	["daylight", 995]
"words"	["one", 1037502]
"words"	["would", 774062]
"words"	["time", 575697]
"words"	["two", 350503]
"words"	["people", 310694]
"words"	["years", 220380]
"words"	["day", 206294]
"words"	["united", 162056]
"words"	["country", 124759]
"words"	["daylight", 995]

In [7]:
# Returns the counts of each word as well
!cat 5gram_test.txt | python GetIndexandOtherWords.py --index-range 9-10     \
                                                      --top-n-words 10       \
                                                      --return-counts True   \
                                                      --use-term-counts False \
                                                       -q


"index"	["establishing", 1]
"index"	["fairy", 1]
"words"	["case", 3]
"words"	["study", 3]
"words"	["collection", 2]
"words"	["biography", 1]
"words"	["child", 1]
"words"	["christmas", 1]
"words"	["circumstantial", 1]
"words"	["city", 1]
"words"	["establishing", 1]
"words"	["fairy", 1]

In [4]:
# Returns the counts of each word as well
!cat 5gram_test.txt | python GetIndexandOtherWords.py --index-range 9-10     \
                                                      --top-n-words 10       \
                                                      --return-counts True   \
                                                       -q


"index"	["circumstantial", 62]
"index"	["establishing", 59]
"words"	["child", 1099]
"words"	["christmas", 1099]
"words"	["wales", 1099]
"words"	["female", 447]
"words"	["collection", 239]
"words"	["fairy", 123]
"words"	["government", 102]
"words"	["city", 62]
"words"	["circumstantial", 62]
"words"	["establishing", 59]

In [5]:
# Shows how to adjust the index_range argument
!cat 5gram_test.txt | python GetIndexandOtherWords.py --index-range 5-10     \
                                                      --top-n-words 10       \
                                                      --return-counts True   \
                                                       -q


"index"	["collection", 239]
"index"	["fairy", 123]
"index"	["government", 102]
"index"	["city", 62]
"index"	["circumstantial", 62]
"index"	["establishing", 59]
"words"	["child", 1099]
"words"	["christmas", 1099]
"words"	["wales", 1099]
"words"	["female", 447]
"words"	["collection", 239]
"words"	["fairy", 123]
"words"	["government", 102]
"words"	["city", 62]
"words"	["circumstantial", 62]
"words"	["establishing", 59]

In [6]:
# Top words can be set as large as you want without error. Uses all possible words
# if top-n-words > number of possible unique words in document.
!cat 5gram_test.txt | python GetIndexandOtherWords.py --index-range 5-10     \
                                                      --top-n-words 10000    \
                                                      --return-counts True   \
                                                       -q


"index"	["study", 604]
"index"	["female", 447]
"index"	["collection", 239]
"index"	["fairy", 123]
"index"	["tales", 123]
"index"	["forms", 116]
"words"	["child", 1099]
"words"	["christmas", 1099]
"words"	["wales", 1099]
"words"	["case", 604]
"words"	["study", 604]
"words"	["female", 447]
"words"	["collection", 239]
"words"	["fairy", 123]
"words"	["tales", 123]
"words"	["forms", 116]
"words"	["government", 102]
"words"	["biography", 92]
"words"	["general", 92]
"words"	["george", 92]
"words"	["circumstantial", 62]
"words"	["city", 62]
"words"	["narrative", 62]
"words"	["sea", 62]
"words"	["bill", 59]
"words"	["establishing", 59]
"words"	["religious", 59]
"words"	["limited", 55]