In [1]:
from cltk.corpus.utils.formatter import assemble_phi5_author_filepaths
from cltk.corpus.utils.formatter import assemble_tlg_author_filepaths
from cltk.corpus.utils.formatter import phi5_plaintext_cleanup
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer
from cltk.stop.greek.stops import STOPS_LIST as greek_stops
from cltk.stop.latin.stops import STOPS_LIST as latin_stops
from nltk.tokenize.punkt import PunktLanguageVars
from cltk.tokenize.sentence import TokenizeSentence
from gensim.models import Word2Vec
import os

Prepare PHI and TLG sentences


In [20]:
def get_sentences(corpus, lemmatize=True):
    assert corpus in ['phi5', 'tlg']
    p = PunktLanguageVars()
    if corpus == 'phi5':
        lang = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        jv = JVReplacer()
        lemmatizer = LemmaReplacer('latin')
        stops = latin_stops
    elif corpus == 'tlg':
        lang = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        lemmatizer = LemmaReplacer('greek')
        stops = greek_stops
    #filepaths = filepaths[:5]  # for testing
    sent_tokenizer = TokenizeSentence(lang)
    for filepath in filepaths:   
        with open(filepath) as f:
            text_raw = f.read()
        if corpus == 'phi5':
            text_clean = phi5_plaintext_cleanup(text_raw)
        elif corpus == 'tlg':
            text_clean = tlg_plaintext_cleanup(text_raw)
        sent_tokens_upper = sent_tokenizer.tokenize_sentences(text_clean)  # sentence tokenize
        sent_tokens = [s.lower() for s in sent_tokens_upper]  # lowercase
        for sent in sent_tokens:  # tokenize words in sentences
            sent_word_tokens = p.word_tokenize(sent)
            if corpus == 'phi5':
                sent_word_tokens = [jv.replace(word) for word in sent_word_tokens]
            sent_word_tokens_new = []
            for word in sent_word_tokens:  # remove punctuation (final period, commas, etc.)
                # begin cleanup for corpus
                if corpus == 'phi5':
                    if word[-1] in ['.', '“']:
                        word_new = word[:-1]
                        sent_word_tokens_new.append(word_new)
                    elif word[0] == '“':
                        word_new = word[1:]
                        sent_word_tokens_new.append(word_new)
                    elif word in [',', '.', ';', ':', '"', "'", '?', '-', '!', '*', '[', ']', '{', '}']:
                        continue
                    elif word in stops:  # remove stops
                        continue
                    elif '˘' in word:  # rm meter
                        continue
                    elif 'á' in word:  # rm accents from vowels; find more graceful way of doing this
                        word_new = word.replace('á', 'a')
                        sent_word_tokens_new.append(word_new)
                    elif 'é' in word:
                        word_new = word.replace('é', 'e')
                        sent_word_tokens_new.append(word_new)
                    elif 'í' in word:
                        word_new = word.replace('í', 'i')
                        sent_word_tokens_new.append(word_new)
                    elif 'ó' in word: #! no 'ó' found in PHI5
                        word_new = word.replace('ó', 'o')
                        sent_word_tokens_new.append(word_new)
                        print('rmd vowel', word, word_new)
                    elif 'ú' in word:
                        word_new = word.replace('ú', 'u')
                        sent_word_tokens_new.append(word_new)
                    else:
                        sent_word_tokens_new.append(word)
                elif corpus == 'tlg':
                    if word[-1] in ['.', '“']:
                        word_new = word[:-1]
                        sent_word_tokens_new.append(word_new)
                    elif word in stops:  # remove stops
                        continue
                    elif word in [',', '.', ';', ':', '"', "'", '?', '-', '!', '*', '[', ']', '{', '}', 'ʹ']:
                        continue
                    else:
                        sent_word_tokens_new.append(word)

            sent_word_tokens_new = [w for w in sent_word_tokens_new if len(w) > 1]  # rm short words

            sentence = [w for w in sent_word_tokens_new if w]  # remove any empty words (created thru above cleanup)
            # remove any empty sentences (created thru above cleanup)
            if sentence:
                if lemmatize:
                    # lemmatize sentences
                    yield lemmatizer.lemmatize(sentence)
                else:
                    yield sentence
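
Because get_sentences() is a generator, the cleanup can be spot-checked cheaply before committing to a full pass over the corpus; a minimal sketch with itertools.islice:


In [ ]:
from itertools import islice

# preview the first three cleaned, lemmatized PHI5 sentences without
# iterating over the whole corpus
for sent in islice(get_sentences('phi5'), 3):
    print(sent)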

Train model, Latin lemmatized


In [22]:
phi_sentences_lemma = get_sentences('phi5')

# note: Word2Vec needs a restartable iterable (it makes multiple passes over
# the sentences), so the generator is materialized into a list first
model = Word2Vec(sentences=list(phi_sentences_lemma), size=100, window=5, min_count=5, workers=4)
# If you're finished training the model (no more updates, only querying),
# you can trim memory by normalizing the vectors in place:
# https://radimrehurek.com/gensim/models/word2vec.html
# model.init_sims(replace=True)


INFO:CLTK:Loading lemmata. This may take a minute.
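
Materializing the generator with list() keeps every cleaned sentence in memory at once. A common alternative, sketched below but not used in this notebook (CorpusSentences is a hypothetical helper name), is a small class whose __iter__ restarts get_sentences(); that gives Word2Vec the restartable iterable it needs, at the cost of re-reading and re-lemmatizing the files on every pass:


In [ ]:
class CorpusSentences:
    """Restartable iterable over get_sentences(); each call to __iter__
    starts a fresh pass, so Word2Vec can iterate multiple times."""
    def __init__(self, corpus):
        self.corpus = corpus

    def __iter__(self):
        return get_sentences(self.corpus)

# e.g.:
# model = Word2Vec(sentences=CorpusSentences('phi5'), size=100,
#                  window=5, min_count=5, workers=4)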

In [3]:
model_path = os.path.expanduser('~/cltk_data/user_data/word2vec_phi_lemma.model')
#model.save(model_path)  # 26 MB

In [4]:
# to load:
model = Word2Vec.load(model_path)

In [5]:
model.most_similar('pars')


Out[5]:
[('pario2', 0.8047900199890137),
 ('plinthus', 0.5860334634780884),
 ('cymatium', 0.5752122402191162),
 ('unus', 0.574550986289978),
 ('duo', 0.5710355043411255),
 ('impages', 0.5561530590057373),
 ('interscapilium', 0.5291818380355835),
 ('divido', 0.5206570625305176),
 ('tres', 0.5156294107437134),
 ('circulus', 0.5138579607009888)]
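
most_similar() is not the only probe; the same model answers pairwise-similarity and odd-one-out queries. A quick sketch using words drawn from the output above, which are therefore known to be in the vocabulary:


In [ ]:
# cosine similarity between two lemmata
model.similarity('pars', 'divido')

# odd one out: three numerals and an architectural term
model.doesnt_match(['unus', 'duo', 'tres', 'plinthus'])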

In [6]:
model.most_similar('sum1')


Out[6]:
[('edo1', 0.6984066367149353),
 ('habeo', 0.5768795013427734),
 ('hic', 0.5717377662658691),
 ('is', 0.5663044452667236),
 ('ille', 0.5483741760253906),
 ('facio', 0.545281171798706),
 ('qui1', 0.5408424735069275),
 ('desperauerunt', 0.47374892234802246),
 ('dico2', 0.4732477068901062),
 ('video', 0.4655439257621765)]
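
The numbered forms in these results ('sum1', 'edo1', 'qui1', 'dico2') are the lemmatizer's headword numbering for homographs, and a stray inflected token like 'desperauerunt' shows where lemmatization fell through. most_similar() also accepts positive/negative lists for analogy arithmetic; a sketch (the query lemmata are illustrative and assume each cleared min_count):


In [ ]:
# classic analogy probe: rex - vir + femina,
# i.e. "vir is to rex as femina is to ?"
model.most_similar(positive=['rex', 'femina'], negative=['vir'], topn=5)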

Train model, Greek lemmatized


In [28]:
tlg_sentences_lemma = get_sentences('tlg')

# as above, materialize the generator so Word2Vec can make multiple passes
model = Word2Vec(sentences=list(tlg_sentences_lemma), size=100, window=5, min_count=5, workers=4)


INFO:CLTK:Loading lemmata. This may take a minute.

In [8]:
model_path = os.path.expanduser('~/cltk_data/user_data/word2vec_tlg_lemma.model')
#model.save(model_path)  # 64 MB

In [9]:
# to load:
model = Word2Vec.load(model_path)

In [10]:
model.most_similar('εἰμί')


Out[10]:
[('ἐστὶν', 0.6621981859207153),
 ('ἐστὶ', 0.633944034576416),
 ('ἐστίν', 0.5510233044624329),
 ('ἀπετελέσθη', 0.5382561087608337),
 ('προϋπῆρχε', 0.5373127460479736),
 ('γέγονεν', 0.5287182331085205),
 ('ἐστί', 0.5228145718574524),
 ('σπαρέν', 0.5213673114776611),
 ('ὁμοφυὴς', 0.516243577003479),
 ('πέφηνεν', 0.5157825946807861)]
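
Several of εἰμί's nearest neighbors (ἐστὶν, ἐστὶ, ἐστίν, ἐστί) are inflected forms the Greek lemmatizer left untouched, so lemmatized and unlemmatized tokens coexist in the vocabulary. The raw embedding behind any vocabulary item can be pulled out directly; with this gensim API the model itself is indexable:


In [ ]:
# raw vector for a vocabulary item
vec = model['εἰμί']
vec.shape  # (100,), matching size=100 above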

In [11]:
model.most_similar('λαμβάνω')


Out[11]:
[('λαμβάνει', 0.7046980857849121),
 ('δίδωμι', 0.6249945163726807),
 ('λαμβάνειν', 0.6200978755950928),
 ('δείκνυμι', 0.6011672019958496),
 ('λάβοι', 0.5923374891281128),
 ('λαβὼν', 0.583453893661499),
 ('λαμβάνων', 0.5456539392471313),
 ('ἀναιρέω', 0.5438410043716431),
 ('οὐκέτι', 0.5282769203186035),
 ('ἀπέδωκεν', 0.5212222337722778)]

In [12]:
model.most_similar('συνδέω')


Out[12]:
[('σύνδεσμέω', 0.6962194442749023),
 ('διαπλανάω', 0.648327112197876),
 ('μερικωτέραις', 0.6285176873207092),
 ('ὀρεγόμεναι', 0.6168799996376038),
 ('χρήσεσι', 0.6121833324432373),
 ('εἰληχόσι', 0.6054438948631287),
 ('ἰδιάζουσι', 0.6051292419433594),
 ('συνδέδεται', 0.6021724939346313),
 ('στερούμενα', 0.600803017616272),
 ('ἀποτείνονται', 0.6003795862197876)]

In [13]:
model.most_similar('ἐλεάω')


Out[13]:
[('εὐεργετέω', 0.6935954093933105),
 ('οἰκτείρειν', 0.6903550624847412),
 ('εὔσπλαγχνος', 0.6548620462417603),
 ('ὑβρίζοντας', 0.6433862447738647),
 ('κατοικτείρει', 0.6393857002258301),
 ('ἁμαρτάνοντας', 0.6389428377151489),
 ('ἐλεήμονας', 0.6382932662963867),
 ('ὁμοδούλους', 0.6350147724151611),
 ('ἐχθραίνοντας', 0.632503092288971),
 ('μισέω', 0.6316449642181396)]
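
As a rough coverage check, the number of types that survived the min_count=5 cutoff is the length of the model's vocabulary dict (exposed directly on the model in this pre-1.0 gensim API):


In [ ]:
# distinct tokens retained after the min_count=5 frequency cutoff
len(model.vocab)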
