In [1]:
from cltk.corpus.utils.formatter import assemble_phi5_author_filepaths
from cltk.corpus.utils.formatter import assemble_tlg_author_filepaths
from cltk.corpus.utils.formatter import phi5_plaintext_cleanup
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer
from cltk.stop.greek.stops import STOPS_LIST as greek_stops
from cltk.stop.latin.stops import STOPS_LIST as latin_stops
from cltk.tokenize.sentence import TokenizeSentence
from gensim.models import Phrases
import logging
from nltk.tokenize.punkt import PunktLanguageVars
import os
import time

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
def gen_sentences(corpus, lemmatize, rm_stops, testing):
    """Yield cleaned, tokenized sentences from the PHI5 or TLG corpus."""
    # TODO: Replace accented chars with unaccented equivalents
    punkt = PunktLanguageVars()
    if corpus == 'phi5':
        language = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        jv_replacer = JVReplacer()
        text_cleaner = phi5_plaintext_cleanup
        if rm_stops:
            stops = latin_stops
        else:
            stops = None
    elif corpus == 'tlg':
        language = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        text_cleaner = tlg_plaintext_cleanup
        if rm_stops:
            stops = greek_stops
        else:
            stops = None
    else:
        raise ValueError("Expected corpus 'phi5' or 'tlg', got '{}'.".format(corpus))
    if lemmatize:
        lemmatizer = LemmaReplacer(language)
    if testing:
        filepaths = filepaths[:100]

    sent_tokenizer = TokenizeSentence(language)

    for filepath in filepaths:
        with open(filepath) as f:
            text = f.read()
        # light first-pass cleanup, before sentence tokenization (which relies on punctuation)
        text = text_cleaner(text, rm_punctuation=False, rm_periods=False)
        sent_tokens = sent_tokenizer.tokenize_sentences(text)

        for sentence in sent_tokens:
            # a second cleanup at sentence-level, to rm all punctuation
            sentence = text_cleaner(sentence, rm_punctuation=True, rm_periods=True)
            sentence = punkt.word_tokenize(sentence)
            sentence = [s.lower() for s in sentence]

            sentence = [w for w in sentence if w]
            if stops:
                sentence = [w for w in sentence if w not in stops]
            sentence = [w for w in sentence if len(w) > 1]  # rm short words

            if lemmatize:
                sentence = lemmatizer.lemmatize(sentence)
            if sentence and language == 'latin':
                sentence = [jv_replacer.replace(word) for word in sentence]
            if sentence:
                yield sentence
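
Because gen_sentences is a generator, it is cheap to spot-check a few sentences before committing to a full Phrases pass. A minimal sketch (not an executed cell from the original run; itertools.islice is just one way to peek):

In [ ]:
from itertools import islice

sents = gen_sentences(corpus='phi5', lemmatize=False, rm_stops=False, testing=True)
for sentence in islice(sents, 3):
    print(sentence)  # each sentence is a list of lowercased tokens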

In [4]:
start = time.time()
sents = gen_sentences(corpus='phi5', lemmatize=False, rm_stops=False, testing=True)
bigram = Phrases(sentences=list(sents), min_count=5)
print(time.time() - start)  # 1m15s / 100 docs


75.31758999824524
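
A trained Phrases model keeps full vocabulary counts around, which is more memory than needed if the goal is only to apply the learned bigrams. Older gensim releases (contemporary with this CLTK API) can freeze the model into a lighter, faster Phraser; gensim 4.x renamed this to FrozenPhrases, obtained via Phrases.freeze(). A sketch, assuming a pre-4.0 gensim:

In [ ]:
from gensim.models.phrases import Phraser

bigram_phraser = Phraser(bigram)  # discards raw counts, keeps only detected phrases
bigram_phraser[['credo', 'hercle', 'helluo']]  # same merges as bigram[...], lower memory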

In [6]:
bigram[['credo', 'hercle', 'helluo', 'tuburcinatur']]  # ['credo_hercle', 'helluo']


Out[6]:
['credo_hercle', 'helluo']
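
The same pattern stacks: feeding bigram-merged sentences back into Phrases learns trigrams (e.g. joining 'a_b' with 'c' into 'a_b_c'). A sketch, re-creating the generator since the one above is already exhausted:

In [ ]:
sents = gen_sentences(corpus='phi5', lemmatize=False, rm_stops=False, testing=True)
trigram = Phrases(bigram[list(sents)], min_count=5)  # bigram[...] streams merged sentences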
