In [18]:
import nltk
from nltk.corpus import wordnet as wn
from collections import OrderedDict
from multiprocessing import Pool

# TODO: move all of this to a regular Python script

# chapters maps each chapter title to that chapter's content;
# chapters in the book are assumed to be separated by a line
# containing the keyword "chapter "
chapters = OrderedDict()
chapter = ''
chapter_title = ''

with open("cryptonomicon_engl.txt_Ascii.txt", "r", encoding="latin-1") as f:
    for line in f:
        line = line.lower()
        # a line containing "chapter " starts a new chapter: store the
        # text accumulated so far under the previous chapter's title,
        # then reset the buffer
        if 'chapter ' in line:
            if chapter_title:
                chapters[chapter_title] = chapter
            chapter_title = line
            chapter = ''
        chapter += line

# the loop never stores the final chapter, so store it here
if chapter_title:
    chapters[chapter_title] = chapter
    
# TODO: do the same with "from wordnik import *"
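
As a quick sanity check (a sketch; the exact chapter count and title format depend on this particular text file), the split can be inspected like this:

In [ ]:
# how many chapters were found, and what do the titles look like?
print(len(chapters), "chapters")
for title in list(chapters)[:3]:
    print(repr(title.strip()))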

In [19]:
def tokenize_chapter(title, text):
    """
    Tokenize a single chapter with NLTK and tag each token
    with its part of speech
    """
    tokens = nltk.wordpunct_tokenize(text)
    tokens = [w.lower() for w in tokens if w.isalpha()]
    pos_tokens = nltk.pos_tag(tokens)
    return (title, pos_tokens)

# tag all chapters in parallel, one chapter per worker process
with Pool() as pool:
    result = pool.starmap(tokenize_chapter, chapters.items())

for title, pos_tokens in result:
    chapters[title] = pos_tokens
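
nltk.pos_tag and the WordNet lookups below rely on NLTK data packages that are not bundled with the library itself. If they are missing, a one-time download is needed (a sketch; these are the package names for NLTK 3.x):

In [ ]:
# one-time downloads into the default NLTK data directory
nltk.download('averaged_perceptron_tagger')  # model behind nltk.pos_tag
nltk.download('wordnet')                     # corpus behind nltk.corpus.wordnet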

In [23]:
def pos_definition(tokenized_chapters, chapters):
    """
    Fills tokenized_chapters with one entry per chapter: a nested dictionary
    with definitions of the chapter's least frequent adjectives ('adjs' key),
    verbs ('verbs' key) and nouns ('nouns' key)
    """
    for title, tokens in chapters.items():
        # three separate lists; chaining `adjectives = nouns = verbs = []`
        # would bind all three names to one shared list
        adjectives, nouns, verbs = [], [], []

        # split tokens by POS via the first letter of the Penn Treebank tag
        for word, tag in tokens:
            if tag[0] == "J":
                adjectives.append((word, tag))
            elif tag[0] == "N":
                nouns.append((word, tag))
            elif tag[0] == "V":
                verbs.append((word, tag))

        # TODO: reduce to one loop over POS type
        least_common_adj = transform_token_list(adjectives, wn.ADJ, 0.04)
        least_common_verb = transform_token_list(verbs, wn.VERB, 0.04)
        least_common_noun = transform_token_list(nouns, wn.NOUN, 0.04)

        result = {}
        result["adjs"] = create_definition_dict(least_common_adj)
        result["verbs"] = create_definition_dict(least_common_verb)
        result["nouns"] = create_definition_dict(least_common_noun)

        tokenized_chapters[title] = result

def transform_token_list(token_list, pos_value, cutoff):
    """
    Deals with the necessary data transformations: drops words WordNet
    doesn't know, flattens the nested synset lists and selects the least
    frequent items based on the cutoff fraction
    """
    token_list_freq = nltk.FreqDist(token_list)

    # most_common() ranks by descending frequency, so the tail of the
    # list holds the least frequent words; plain .keys() carries no
    # frequency ordering
    token_list_keys = [key for key, count in token_list_freq.most_common()]
    tokens = [wn.synsets(w, pos=pos_value) for w, t in token_list_keys]

    # drop words without a WordNet entry (empty synset lists)
    tokens = list(filter(None, tokens))

    # flatten the list of synset lists
    tokens = [item for sublist in tokens for item in sublist]
    tokens = [(item.lemma_names()[0], item.definition()) for item in tokens]

    # slice off the rarest tail; guard against fraction == 0,
    # since tokens[-0:] would return the whole list
    fraction = int(len(tokens) * cutoff)
    return tokens[-fraction:] if fraction else []
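
# For intuition, most_common() on a toy FreqDist (made-up words):
#   nltk.FreqDist([('good', 'JJ')] * 5 + [('rare', 'JJ')]).most_common()
#   -> [(('good', 'JJ'), 5), (('rare', 'JJ'), 1)]
# so slicing the tail keeps the least frequent items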

def create_definition_dict(token_list):
    """
    Reduces the several definitions of a lemma to one
    semicolon-separated definition
    """
    result = {}

    for lemma, definition in token_list:
        if lemma not in result:
            result[lemma] = definition
        else:
            result[lemma] += "; " + definition

    return result
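
# For example (hypothetical definitions, shortened):
#   create_definition_dict([('bark', 'a dog cry'), ('bark', 'tree covering')])
#   -> {'bark': 'a dog cry; tree covering'}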

# find the least common adjectives, nouns and verbs for each chapter
# and add their definitions to the dict
tokenized_chapters = OrderedDict()
pos_definition(tokenized_chapters, chapters)
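
To spot-check the result (a sketch; the first key depends on how the chapters were split):

In [ ]:
# peek at the rarest adjectives found for the first chapter
first_title = next(iter(tokenized_chapters))
list(tokenized_chapters[first_title]["adjs"].items())[:3]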

In [31]:
from pylatex import Document, Section, Subsection, Command, Description, Package

def create_subsection(section_name, section_code, doc, token_dict):
    """
    Creates a Thesaurus subsection for the specified POS
    """
    with doc.create(Subsection(section_name)):
        with doc.create(Description()) as desc:
            for name, description in token_dict[section_code].items():
                desc.add_item(name, description)

doc = Document("Thesaurus")

doc.packages.append(Package("hyperref"))
doc.append(Command("tableofcontents"))
doc.append(Command("clearpage"))

for title, token_dict in tokenized_chapters.items():
    with doc.create(Section(title)):
        create_subsection("Adjectives", "adjs", doc, token_dict)
        create_subsection("Nouns", "nouns", doc, token_dict)
        create_subsection("Verbs", "verbs", doc, token_dict)
                    

doc.generate_pdf("Thesaurus")


This is pdfTeX, Version 3.1415926-2.5-1.40.14 (TeX Live 2013/Debian)
 restricted \write18 enabled.
entering extended mode
(./Thesaurus.tex
LaTeX2e <2011/06/27>
Babel <3.9h> and hyphenation patterns for 2 languages loaded.
(/usr/share/texlive/texmf-dist/tex/latex/base/article.cls
Document Class: article 2007/10/19 v1.4h Standard LaTeX document class
(/usr/share/texlive/texmf-dist/tex/latex/base/size10.clo))
(/usr/share/texlive/texmf-dist/tex/latex/base/fontenc.sty
(/usr/share/texlive/texmf-dist/tex/latex/base/t1enc.def))
(/usr/share/texlive/texmf-dist/tex/latex/base/inputenc.sty
(/usr/share/texlive/texmf-dist/tex/latex/base/utf8.def
(/usr/share/texlive/texmf-dist/tex/latex/base/t1enc.dfu)
(/usr/share/texlive/texmf-dist/tex/latex/base/ot1enc.dfu)
(/usr/share/texlive/texmf-dist/tex/latex/base/omsenc.dfu)))
(/usr/share/texmf/tex/latex/lm/lmodern.sty)
[fontenc.sty / t1enc.def pair repeated 15 more times]
(/usr/share/texlive/texmf-dist/tex/latex/hyperref/hyperref.sty
(/usr/share/texlive/texmf-dist/tex/generic/oberdiek/hobsub-hyperref.sty
(/usr/share/texlive/texmf-dist/tex/generic/oberdiek/hobsub-generic.sty))
(/usr/share/texlive/texmf-dist/tex/latex/graphics/keyval.sty)
(/usr/share/texlive/texmf-dist/tex/generic/ifxetex/ifxetex.sty)
(/usr/share/texlive/texmf-dist/tex/latex/oberdiek/auxhook.sty)
(/usr/share/texlive/texmf-dist/tex/latex/oberdiek/kvoptions.sty)
(/usr/share/texlive/texmf-dist/tex/latex/hyperref/pd1enc.def)
(/usr/share/texlive/texmf-dist/tex/latex/latexconfig/hyperref.cfg)

! LaTeX Error: File `url.sty' not found.

Type X to quit or <RETURN> to proceed,
or enter new name. (Default extension: sty)

Enter file name: 
! Emergency stop.
<read *> 
         
l.5037 \let
           \HyOrg@url\url^^M
!  ==> Fatal error occurred, no output PDF file produced!
Transcript written on Thesaurus.log.

---------------------------------------------------------------------------
CalledProcessError                        Traceback (most recent call last)
<ipython-input-31-515ff25caea1> in <module>()
     24 
     25 
---> 26 doc.generate_pdf("Thesaurus")

/home/kernelmode/anaconda3/lib/python3.5/site-packages/pylatex/document.py in generate_pdf(self, filepath, clean, clean_tex, compiler, compiler_args, silent)
    167                 # For all other errors print the output and raise the error
    168                 print(e.output.decode())
--> 169                 raise(e)
    170             else:
    171                 if not silent:

/home/kernelmode/anaconda3/lib/python3.5/site-packages/pylatex/document.py in generate_pdf(self, filepath, clean, clean_tex, compiler, compiler_args, silent)
    155             try:
    156                 output = subprocess.check_output(command,
--> 157                                                  stderr=subprocess.STDOUT)
    158             except (OSError, IOError) as e:
    159                 # Use FileNotFoundError when python 2 is dropped

/home/kernelmode/anaconda3/lib/python3.5/subprocess.py in check_output(timeout, *popenargs, **kwargs)
    627 
    628     return run(*popenargs, stdout=PIPE, timeout=timeout, check=True,
--> 629                **kwargs).stdout
    630 
    631 

/home/kernelmode/anaconda3/lib/python3.5/subprocess.py in run(input, timeout, check, *popenargs, **kwargs)
    709         if check and retcode:
    710             raise CalledProcessError(retcode, process.args,
--> 711                                      output=stdout, stderr=stderr)
    712     return CompletedProcess(process.args, retcode, stdout, stderr)
    713 

CalledProcessError: Command '['pdflatex', '--interaction=nonstopmode', 'Thesaurus.tex']' returned non-zero exit status 1
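
The compile fails because this TeX installation is missing url.sty, which hyperref loads; it is an environment problem, not a pylatex one. On a Debian/Ubuntu TeX Live setup the file usually comes with the texlive-latex-recommended package (an assumption about this particular machine). Until that is fixed, the .tex source can still be generated and inspected without compiling:

In [ ]:
# emit Thesaurus.tex only; compile by hand once url.sty is installed
doc.generate_tex("Thesaurus")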

In [ ]: