In [1]:
from cltk.corpus.utils.formatter import assemble_phi5_author_filepaths
from cltk.corpus.utils.formatter import assemble_tlg_author_filepaths
from cltk.corpus.utils.formatter import phi5_plaintext_cleanup
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer
from cltk.stop.greek.stops import STOPS_LIST as greek_stops
from cltk.stop.latin.stops import STOPS_LIST as latin_stops
from nltk.tokenize.punkt import PunktLanguageVars
from cltk.tokenize.sentence import TokenizeSentence
from gensim.models import Word2Vec
import os
In [20]:
def get_sentences(corpus, lemmatize=True):
    """Stream cleaned (and optionally lemmatized) sentences, one word list per sentence."""
    assert corpus in ['phi5', 'tlg']
    p = PunktLanguageVars()
    if corpus == 'phi5':
        lang = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        jv = JVReplacer()
        lemmatizer = LemmaReplacer('latin')
        stops = latin_stops
    elif corpus == 'tlg':
        lang = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        lemmatizer = LemmaReplacer('greek')
        stops = greek_stops
    #filepaths = filepaths[:5]  # for testing
    sent_tokenizer = TokenizeSentence(lang)
    for filepath in filepaths:
        with open(filepath) as f:
            text_raw = f.read()
        if corpus == 'phi5':
            text_clean = phi5_plaintext_cleanup(text_raw)
        elif corpus == 'tlg':
            text_clean = tlg_plaintext_cleanup(text_raw)
        sent_tokens_upper = sent_tokenizer.tokenize_sentences(text_clean)  # sentence tokenize
        sent_tokens = [s.lower() for s in sent_tokens_upper]  # lowercase
        for sent in sent_tokens:  # tokenize words in sentences
            sent_word_tokens = p.word_tokenize(sent)
            if corpus == 'phi5':
                sent_word_tokens = [jv.replace(word) for word in sent_word_tokens]  # normalize j/v
            sent_word_tokens_new = []
            for word in sent_word_tokens:  # remove punctuation (final period, commas, etc.)
                # begin cleanup for corpus
                if corpus == 'phi5':
                    if word[-1] in ['.', '“']:
                        word_new = word[:-1]
                        sent_word_tokens_new.append(word_new)
                    elif word[0] == '“':
                        word_new = word[1:]
                        sent_word_tokens_new.append(word_new)
                    elif word in [',', '.', ';', ':', '"', "'", '?', '-', '!', '*', '[', ']', '{', '}']:
                        continue
                    elif word in stops:  # remove stops
                        continue
                    elif '˘' in word:  # rm meter
                        continue
                    elif 'á' in word:  # rm accents from vowels; find more graceful way of doing this
                        word_new = word.replace('á', 'a')
                        sent_word_tokens_new.append(word_new)
                    elif 'é' in word:
                        word_new = word.replace('é', 'e')
                        sent_word_tokens_new.append(word_new)
                    elif 'í' in word:
                        word_new = word.replace('í', 'i')
                        sent_word_tokens_new.append(word_new)
                    elif 'ó' in word:  #! no 'ó' found in PHI5
                        word_new = word.replace('ó', 'o')
                        sent_word_tokens_new.append(word_new)
                        print('rmd vowel', word, word_new)
                    elif 'ú' in word:
                        word_new = word.replace('ú', 'u')
                        sent_word_tokens_new.append(word_new)
                    else:
                        sent_word_tokens_new.append(word)
                elif corpus == 'tlg':
                    if word[-1] in ['.', '“']:
                        word_new = word[:-1]
                        sent_word_tokens_new.append(word_new)
                    elif word in stops:  # remove stops
                        continue
                    elif word in [',', '.', ';', ':', '"', "'", '?', '-', '!', '*', '[', ']', '{', '}', 'ʹ']:
                        continue
                    else:
                        sent_word_tokens_new.append(word)
            sent_word_tokens_new = [w for w in sent_word_tokens_new if len(w) > 1]  # rm short words
            sentence = [w for w in sent_word_tokens_new if w]  # rm empty words (created through above cleanup)
            if sentence:  # rm empty sentences (created through above cleanup)
                if lemmatize:
                    yield lemmatizer.lemmatize(sentence)
                else:
                    yield sentence
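Before training on the full corpus it can help to sanity-check the generator on a single sentence. This snippet is not in the original notebook, just a quick check that assumes the PHI5 corpus is already installed locally:

first_sentence = next(get_sentences('phi5'))  # pull one cleaned, lemmatized sentence
print(first_sentence[:10])                    # inspect the first few lemmata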
In [22]:
phi_sentences_lemma = get_sentences('phi5')
# note: Word2Vec makes several passes over the sentences (vocab scan plus training epochs),
# so it needs a restartable iterable, not a one-shot generator; hence the list() call
model = Word2Vec(sentences=list(phi_sentences_lemma), size=100, window=5, min_count=5, workers=4)
# If you are finished training a model (no more updates, only querying), you can trim memory with:
# model.init_sims(replace=True)
# https://radimrehurek.com/gensim/models/word2vec.html
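If materializing every sentence into a list is too memory-hungry, Word2Vec also accepts any object whose __iter__ returns a fresh generator on each pass. The wrapper below is a hedged sketch (the class name SentenceCorpus is invented here, not part of the notebook):

class SentenceCorpus:
    """Restartable iterable over get_sentences(); gensim can re-read it on every pass."""
    def __init__(self, corpus, lemmatize=True):
        self.corpus = corpus
        self.lemmatize = lemmatize

    def __iter__(self):
        # a new generator is built on each pass, so the vocab scan and every
        # training epoch all see the full corpus
        return get_sentences(self.corpus, lemmatize=self.lemmatize)

# model = Word2Vec(sentences=SentenceCorpus('phi5'), size=100, window=5, min_count=5, workers=4)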
In [3]:
model_path = os.path.expanduser('~/cltk_data/user_data/word2vec_phi_lemma.model')
#model.save(model_path) # 26 MB
In [4]:
# to load:
model = Word2Vec.load(model_path)
In [5]:
model.most_similar('pars')
Out[5]:
In [6]:
model.most_similar('sum1')  # the lemmatizer appends a digit to disambiguate homographs, so 'sum1' is the lemma of the verb sum
Out[6]:
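Beyond most_similar, the same model answers a few other similarity queries. A sketch only: the lemmata below are illustrative and must have cleared the min_count threshold to be in the vocabulary.

model.similarity('pars', 'sum1')               # cosine similarity between two lemmata
model.doesnt_match(['pars', 'sum1', 'habeo'])  # odd-one-out query over a list of in-vocabulary words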
In [28]:
tlg_sentences_lemma = get_sentences('tlg')
model = Word2Vec(sentences=list(tlg_sentences_lemma), size=100, window=5, min_count=5, workers=4)
In [8]:
model_path = os.path.expanduser('~/cltk_data/user_data/word2vec_tlg_lemma.model') # 64M
#model.save(model_path)
In [9]:
# to load:
model = Word2Vec.load(model_path)
In [10]:
model.most_similar('εἰμί')
Out[10]:
In [11]:
model.most_similar('λαμβάνω')
Out[11]:
In [12]:
model.most_similar('συνδέω')
Out[12]:
In [13]:
model.most_similar('ἐλεάω')
Out[13]:
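For downstream use (clustering, plotting, and so on) the raw vectors can be pulled straight out of the model. A sketch, noting that the attribute layout shifted across gensim releases:

vec = model['εἰμί']  # 100-dimensional numpy vector for a lemma
print(vec.shape)     # (100,)
# on gensim releases where the vectors live on a KeyedVectors object:
# vec = model.wv['εἰμί']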