In [1]:
from cltk.corpus.utils.formatter import assemble_phi5_author_filepaths
from cltk.corpus.utils.formatter import assemble_tlg_author_filepaths
from cltk.corpus.utils.formatter import phi5_plaintext_cleanup
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer
from collections import Counter
from itertools import groupby
from nltk.tokenize.punkt import PunktLanguageVars
from operator import itemgetter
import os

In [2]:
def get_lemmata(corpus):
    """Lazily yield the lemmata of every author file in a CLTK corpus.

    For each path returned by the corpus' ``assemble_*_author_filepaths``
    helper, the file is read, passed through the matching
    ``*_plaintext_cleanup`` function, stripped of a fixed set of punctuation
    characters, lower-cased, word-tokenized, run through
    ``JVReplacer.replace`` and finally lemmatized one token at a time.

    Args:
        corpus: Either ``'phi5'`` (lemmatized with ``LemmaReplacer('latin')``)
            or ``'tlg'`` (lemmatized with ``LemmaReplacer('greek')``).

    Yields:
        One list per file, holding the first element of
        ``lemmatizer.lemmatize(token)`` for every token of that file.

    Raises:
        AssertionError: If ``corpus`` is neither ``'phi5'`` nor ``'tlg'``.
            Because this is a generator, the check runs when the first item
            is requested, not when ``get_lemmata`` is called.
    """
    assert corpus in ['phi5', 'tlg'], \
        "corpus must be 'phi5' or 'tlg', got {!r}".format(corpus)

    # Resolve everything that depends on the corpus once, before the loop.
    if corpus == 'phi5':
        files_list = assemble_phi5_author_filepaths()
        lemmatizer = LemmaReplacer('latin')
        cleanup = phi5_plaintext_cleanup
    else:
        files_list = assemble_tlg_author_filepaths()
        lemmatizer = LemmaReplacer('greek')
        cleanup = tlg_plaintext_cleanup

    j = JVReplacer()
    p = PunktLanguageVars()

    # Deletion table for the punctuation to drop; translate() removes all of
    # these characters in a single pass instead of a per-character list scan.
    punctuation_table = str.maketrans('', '', ',.;:"\'?-!*[]{}#%()/&“”')

    for path in files_list:
        # NOTE(review): the file is decoded with the platform default
        # encoding; confirm the corpus encoding and pass it explicitly.
        with open(path) as f:
            raw_text = f.read()
        text = cleanup(raw_text).translate(punctuation_table)
        # JVReplacer is applied to both corpora, not only the Latin one.
        tokens = [j.replace(word) for word in p.word_tokenize(text.lower())]
        yield [lemmatizer.lemmatize(token)[0] for token in tokens]

Latin


In [3]:
# Count lemma frequencies over the PHI5 corpus. The counter is updated one
# file at a time so the full list of lemmata is never held in memory at once.
lemmata = get_lemmata('phi5')

count_lemmata = Counter()
for file_lemmata in lemmata:
    count_lemmata.update(file_lemmata)

# Keep the 10,000 most frequent lemmata as (lemma, count) pairs.
lemma_mc = count_lemmata.most_common(10000)

lemma_file_rel = '~/cltk_data/user_data/latin_lemma_most_common.txt'
lemma_file = os.path.expanduser(lemma_file_rel)

# Make sure the target directory exists so the write below cannot fail with
# FileNotFoundError after the whole corpus has already been processed.
os.makedirs(os.path.dirname(lemma_file), exist_ok=True)

# Mode 'w' creates the file or truncates an existing one, so no separate
# "empty the file" step is needed. One tab-separated "lemma<TAB>count" line
# is written per entry, most frequent first.
with open(lemma_file, 'w', encoding='utf-8') as fo:
    for lemma, count in lemma_mc:
        fo.write('{}\t{}\n'.format(lemma, count))


INFO:CLTK:Loading lemmata. This may take a minute.

Greek


In [4]:
# Count lemma frequencies over the TLG corpus. The counter is updated one
# file at a time so the full list of lemmata is never held in memory at once.
lemmata = get_lemmata('tlg')

count_lemmata = Counter()
for file_lemmata in lemmata:
    count_lemmata.update(file_lemmata)

# Keep the 10,000 most frequent lemmata as (lemma, count) pairs.
lemma_mc = count_lemmata.most_common(10000)

lemma_file_rel = '~/cltk_data/user_data/greek_lemma_most_common.txt'
lemma_file = os.path.expanduser(lemma_file_rel)

# Make sure the target directory exists so the write below cannot fail with
# FileNotFoundError after the whole corpus has already been processed.
os.makedirs(os.path.dirname(lemma_file), exist_ok=True)

# Mode 'w' creates the file or truncates an existing one, so no separate
# "empty the file" step is needed. UTF-8 is set explicitly so writing the
# lemmata does not depend on the platform's default encoding.
with open(lemma_file, 'w', encoding='utf-8') as fo:
    for lemma, count in lemma_mc:
        fo.write('{}\t{}\n'.format(lemma, count))


INFO:CLTK:Loading lemmata. This may take a minute.