In [1]:
from cltk.corpus.utils.formatter import assemble_phi5_author_filepaths
from cltk.corpus.utils.formatter import assemble_tlg_author_filepaths
from cltk.corpus.utils.formatter import phi5_plaintext_cleanup
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer
from collections import Counter
from itertools import groupby
from nltk.tokenize.punkt import PunktLanguageVars
from operator import itemgetter
import os
In [2]:
def get_lemmata(corpus):
    """Lazily yield the lemmata of a CLTK corpus, one list per author file.

    For every file path returned by the corpus' ``assemble_*_author_filepaths``
    helper, the file is read, passed through the corpus-specific plaintext
    cleanup function, stripped of a fixed set of punctuation characters,
    lower-cased, tokenized, run through ``JVReplacer`` and finally lemmatized.

    Args:
        corpus: Either ``'phi5'`` (lemmatized with ``LemmaReplacer('latin')``)
            or ``'tlg'`` (lemmatized with ``LemmaReplacer('greek')``).

    Yields:
        list: The lemmata of one author file, in token order.

    Raises:
        AssertionError: If ``corpus`` is not ``'phi5'`` or ``'tlg'``. Because
            this is a generator, the check only runs once iteration starts.
    """
    assert corpus in ['phi5', 'tlg']
    # Select the per-corpus helpers once instead of re-testing `corpus`
    # for every file inside the loop.
    if corpus == 'phi5':
        files_list = assemble_phi5_author_filepaths()
        lemmatizer = LemmaReplacer('latin')
        cleanup = phi5_plaintext_cleanup
    elif corpus == 'tlg':
        files_list = assemble_tlg_author_filepaths()
        lemmatizer = LemmaReplacer('greek')
        cleanup = tlg_plaintext_cleanup
    j = JVReplacer()
    p = PunktLanguageVars()
    # Translation table that deletes every punctuation character in one pass;
    # built once rather than scanning a list for each character of each file.
    strip_punctuation = str.maketrans('', '', ',.;:"\'?-!*[]{}#%()/&“”')
    for path in files_list:
        # Explicit encoding so the result does not depend on the platform's
        # default locale.
        # NOTE(review): assumes the converted corpus files are UTF-8 -- confirm.
        with open(path, encoding='utf-8') as f:
            raw_text = f.read()
        text = cleanup(raw_text).translate(strip_punctuation)
        tokens = p.word_tokenize(text.lower())
        # JVReplacer is applied to both corpora, as in the original notebook.
        tokens = [j.replace(word) for word in tokens]
        # Each token is lemmatized on its own; only the first element of the
        # lemmatizer's result is kept.
        lemmata = [lemmatizer.lemmatize(x)[0] for x in tokens]
        yield lemmata
In [3]:
# Count how often each lemma occurs across the 'phi5' (Latin) corpus.
# The counter is updated one author file at a time, so the full token list
# of the corpus is never held in memory.
lemmata = get_lemmata('phi5')
count_lemmata = Counter()
for x in lemmata:
    count_lemmata.update(x)

# (lemma, count) pairs for the 10,000 most frequent lemmata, most frequent first.
lemma_mc = count_lemmata.most_common(10000)

lemma_file_rel = '~/cltk_data/user_data/latin_lemma_most_common.txt'
lemma_file = os.path.expanduser(lemma_file_rel)
# Make sure the target directory exists so the write does not fail on first use.
os.makedirs(os.path.dirname(lemma_file), exist_ok=True)

# Mode 'w' creates the file or truncates an existing one, so no separate
# "empty the file" step is needed. The encoding is explicit so the output
# does not depend on the platform's default locale.
with open(lemma_file, 'w', encoding='utf-8') as fo:
    for lemma, count in lemma_mc:
        # One tab-separated "lemma<TAB>count" record per line.
        fo.write(lemma + '\t' + str(count) + '\n')
In [4]:
# Count how often each lemma occurs across the 'tlg' (Greek) corpus.
# The counter is updated one author file at a time, so the full token list
# of the corpus is never held in memory.
lemmata = get_lemmata('tlg')
count_lemmata = Counter()
for x in lemmata:
    count_lemmata.update(x)

# (lemma, count) pairs for the 10,000 most frequent lemmata, most frequent first.
lemma_mc = count_lemmata.most_common(10000)

lemma_file_rel = '~/cltk_data/user_data/greek_lemma_most_common.txt'
lemma_file = os.path.expanduser(lemma_file_rel)
# Make sure the target directory exists so the write does not fail on first use.
os.makedirs(os.path.dirname(lemma_file), exist_ok=True)

# Mode 'w' creates the file or truncates an existing one, so no separate
# "empty the file" step is needed. The encoding is explicit because Greek
# lemmata cannot be written under a non-Unicode default locale encoding.
with open(lemma_file, 'w', encoding='utf-8') as fo:
    for lemma, count in lemma_mc:
        # One tab-separated "lemma<TAB>count" record per line.
        fo.write(lemma + '\t' + str(count) + '\n')