In [1]:
from cltk.corpus.utils.formatter import assemble_phi5_author_filepaths
from cltk.corpus.utils.formatter import assemble_tlg_author_filepaths
from cltk.corpus.utils.formatter import phi5_plaintext_cleanup
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer
from cltk.stop.greek.stops import STOPS_LIST as greek_stops
from cltk.stop.latin.stops import STOPS_LIST as latin_stops
from cltk.tokenize.sentence import TokenizeSentence
from gensim.models import Phrases
import logging
from nltk.tokenize.punkt import PunktLanguageVars
import os
import time
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
In [2]:
def gen_sentences(corpus, lemmatize, rm_stops, testing):
    """Stream a corpus one cleaned sentence (a list of lowercased words) at a time."""
    # TODO: replace accented chars with unaccented equivalents (one approach is sketched below)
    punkt = PunktLanguageVars()
    if corpus == 'phi5':
        language = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        jv_replacer = JVReplacer()
        text_cleaner = phi5_plaintext_cleanup
        stops = latin_stops if rm_stops else None
    elif corpus == 'tlg':
        language = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        text_cleaner = tlg_plaintext_cleanup
        stops = greek_stops if rm_stops else None
    if lemmatize:
        lemmatizer = LemmaReplacer(language)
    if testing:
        filepaths = filepaths[:100]
    sent_tokenizer = TokenizeSentence(language)
    for filepath in filepaths:
        with open(filepath) as f:
            text = f.read()
        # light first-pass cleanup, before sentence tokenization (which relies on punctuation)
        text = text_cleaner(text, rm_punctuation=False, rm_periods=False)
        sent_tokens = sent_tokenizer.tokenize_sentences(text)
        for sentence in sent_tokens:
            # a second cleanup at sentence level, to remove all punctuation
            sentence = text_cleaner(sentence, rm_punctuation=True, rm_periods=True)
            sentence = punkt.word_tokenize(sentence)
            sentence = [s.lower() for s in sentence]
            sentence = [w for w in sentence if w]
            if stops:
                sentence = [w for w in sentence if w not in stops]
            sentence = [w for w in sentence if len(w) > 1]  # rm short words
            if sentence and lemmatize:
                sentence = lemmatizer.lemmatize(sentence)
            if sentence and language == 'latin':
                sentence = [jv_replacer.replace(word) for word in sentence]
            if sentence:
                yield sentence
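The TODO at the top of gen_sentences is left open in this notebook. A minimal sketch of one common approach, using only the standard library (the helper name strip_accents is mine, not part of the original code):

    import unicodedata

    def strip_accents(word):
        # Decompose each character (NFKD), then drop the combining marks.
        decomposed = unicodedata.normalize('NFKD', word)
        return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

Inside the sentence loop this could be applied as sentence = [strip_accents(w) for w in sentence], e.g. turning 'θεός' into 'θεος'.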
In [4]:
start = time.time()
sents = gen_sentences(corpus='phi5', lemmatize=False, rm_stops=False, testing=True)
bigram = Phrases(sentences=list(sents), min_count=5)
print(time.time() - start)  # ~1m15s for 100 docs
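If the trained model is worth keeping, Phrases inherits gensim's standard save()/load() persistence, so the pass over the corpus need not be repeated (the filename below is just an example):

    bigram.save('phi5_bigram.model')            # serialize the trained collocation model
    bigram = Phrases.load('phi5_bigram.model')  # restore it in a later session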
In [6]:
bigram[['credo', 'hercle', 'helluo', 'tuburcinatur']] # ['credo_hercle', 'helluo']
Out[6]:
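The usual next step (not shown in this notebook) is to stream every sentence through the phrase detector, so that detected collocations such as credo_hercle become single tokens, and then train a downstream model on the result. A sketch using gensim's Word2Vec:

    from gensim.models import Word2Vec

    # the generator above is exhausted, so re-create it
    sents = gen_sentences(corpus='phi5', lemmatize=False, rm_stops=False, testing=True)
    phrase_sents = [bigram[sentence] for sentence in sents]
    model = Word2Vec(phrase_sents, min_count=5)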