In [1]:
from cltk.corpus.utils.formatter import assemble_phi5_author_filepaths
from cltk.corpus.utils.formatter import phi5_plaintext_cleanup
from cltk.stop.latin.stops import STOPS_LIST
from nltk.tokenize.punkt import PunktLanguageVars
from cltk.tokenize.sentence import TokenizeSentence

from gensim.models import Word2Vec
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %message)s', level=logging.INFO)

Prepare PHI sentences


In [2]:
filepaths = assemble_phi5_author_filepaths()
sent_tokenizer = TokenizeSentence('latin')
p = PunktLanguageVars()

In [89]:
phi_sentences = []
for filepath in filepaths:
    with open(filepath) as f:
        text_raw = f.read()
    text_clean = phi5_plaintext_cleanup(text_raw)  # phi5_plaintext_cleanup()
    sent_tokens_upper = sent_tokenizer.tokenize_sentences(text_clean)  # sentence tokenize
    sent_tokens = [s.lower() for s in sent_tokens_upper]  # lowercase
    #sentence_tokens_author = []
    for sent in sent_tokens:  # tokenize words in sentences
        sent_word_tokens = []
        sent_word_tokens = p.word_tokenize(sent)
        sent_word_tokens_new = []
        for word in sent_word_tokens:  # remove punctuation (final period, commas, etc)
            if word[-1] in ['.', '“']:
                word_new = word[:-1]
                sent_word_tokens_new.append(word_new)
            elif word[0] =='“':
                word_new = word[1:]
                sent_word_tokens_new.append(word_new)
            elif word in [',', '.', ';', ':', '"', "'", '?', '-', '!', '*', '[', ']', '{', '}']:
                continue
            elif word in STOPS_LIST:  # remove stops
                continue
            elif '˘' in word:  # rm meter
                continue
            elif 'á' in word:  # rm accents from vowels; find more graceful way of doing this
                word_new = word.replace('á', 'a')
                sent_word_tokens_new.append(word_new)
            elif 'é' in word:
                word_new = word.replace('é', 'e')
                sent_word_tokens_new.append(word_new)
            elif 'í' in word:
                word_new = word.replace('í', 'i')
                sent_word_tokens_new.append(word_new)
            elif 'ó' in word: #! no 'ó' found in PHI5
                word_new = word.replace('ó', 'o')
                sent_word_tokens_new.append(word_new)
                print('rmd vowel', word, word_new)
            elif 'ú' in word:
                word_new = word.replace('ú', 'u')
                sent_word_tokens_new.append(word_new)
            else:
                sent_word_tokens_new.append(word)
        sent_word_tokens_new = [w for w in sent_word_tokens_new if len(w) > 1]  # rm short words

        sentence = [w for w in sent_word_tokens_new if w]  # remove any empty words (created thru above cleanup)
        if sentence:  # remove any empty sentences (created thru above cleanup)
            phi_sentences.append(sentence)

In [92]:
print(phi_sentences[:5])
print('Total sentences:', len(phi_sentences))


[['calata', 'comitia'], ['curiata'], ['centuriata', 'uniuersum', 'populum', 'partem', 'aliquam', 'adesse', 'iubet', 'comitia', 'concilium', 'edicere', 'debet'], ['tribuni', 'aduocant', 'patricios', 'eos', 'referre', 'ulla', 're', 'possunt'], ['leges', 'proprie', 'plebis', 'scita', 'appellantur', 'tribunis', 'plebis', 'ferentibus', 'accepta', 'sunt']]
Total sentences: 483762

Train model


In [124]:
model = Word2Vec(sentences=phi_sentences, size=100, window=5, min_count=5, workers=4)
# If you’re finished training a model (=no more updates, only querying), you can do
# https://radimrehurek.com/gensim/models/word2vec.html
# model.init_sims(replace=True)

In [3]:
import os
model_path = os.path.expanduser('~/cltk_data/user_data/word2vec_phi.model')

In [105]:
model.save(model_path)  # 84 MB

In [4]:
# to load:
model = Word2Vec.load(model_path)

Fun with word2vec


In [107]:
model.most_similar('memor')


Out[107]:
[('coniugis', 0.7045540809631348),
 ('inmemor', 0.6948878765106201),
 ('exequias', 0.6811513304710388),
 ('perfide', 0.6685100197792053),
 ('parentis', 0.6603140830993652),
 ("'quaenam", 0.657184898853302),
 ('miserere', 0.6534832715988159),
 ('genetrix', 0.6532148122787476),
 ('miserae', 0.6508104801177979),
 ('titulum', 0.6492555141448975)]

In [108]:
model.most_similar('amat')


Out[108]:
[('odit', 0.7035549283027649),
 ('dolet', 0.6746412515640259),
 ('flet', 0.6656931638717651),
 ('optat', 0.6644587516784668),
 ('demens', 0.649543285369873),
 ('durus', 0.6433272361755371),
 ('risit', 0.6408871412277222),
 ('admovet', 0.6402783393859863),
 ('infelix', 0.6373776197433472),
 ('care', 0.630918025970459)]

In [109]:
model.most_similar('aeneas')


Out[109]:
[('achillis', 0.7200091481208801),
 ('troia', 0.6933267712593079),
 ('achilles', 0.690603494644165),
 ('mars', 0.6878527402877808),
 ('aeneae', 0.6771496534347534),
 ('priami', 0.6710656881332397),
 ('iove', 0.6604468822479248),
 ('hector', 0.6573183536529541),
 ('saturnia', 0.6531522870063782),
 ('senior', 0.6517125368118286)]

In [110]:
model.most_similar('cura')


Out[110]:
[('gloria', 0.7287247776985168),
 ('fama', 0.6642335057258606),
 ('senectus', 0.6609640121459961),
 ('laus', 0.657171905040741),
 ('fiducia', 0.6522766351699829),
 ('lascivia', 0.6502325534820557),
 ('potentia', 0.6471846699714661),
 ('favor', 0.6339759826660156),
 ('quies', 0.6272529363632202),
 ('reverentia', 0.6216776371002197)]

In [5]:
# "puer" is to "pater" as "filia" is to ...?
model.most_similar(['filia', 'pater'], ['puer'], topn=3)  # 'should' be mater!


Out[5]:
[('felix', 0.6785154938697815),
 ('parens', 0.6661562919616699),
 ('quirine', 0.6641063690185547)]

In [113]:
# which word doesn't go with the others?
model.doesnt_match("filius pater mater canis".split())


Out[113]:
'canis'

In [119]:
model.similarity('pater', 'mater')


Out[119]:
0.60847165245365753

In [121]:
model.similarity('pater', 'canis')


Out[121]:
0.092073954706546265

In [123]:
model['hasta']


Out[123]:
array([-0.05744257,  0.22592203,  0.04018604, -0.07586946,  0.0119054 ,
        0.10377435,  0.01969249, -0.01103721,  0.30429131,  0.0545937 ,
       -0.25224891, -0.2993449 ,  0.00218131,  0.00991641, -0.03970297,
        0.09024904,  0.08637553,  0.1639163 ,  0.15967998,  0.1670309 ,
       -0.04222492, -0.14281853,  0.10452943, -0.30463064,  0.13105001,
       -0.04037181, -0.09128801,  0.24211241,  0.03005035, -0.11448155,
       -0.01809427,  0.06177646, -0.17334674,  0.19290391,  0.0890111 ,
       -0.11700562,  0.20461507, -0.02512585, -0.07106511, -0.13127086,
        0.17325069,  0.16714285, -0.05994416,  0.18736716,  0.14231586,
        0.12930287, -0.17272429,  0.05862473,  0.26044106, -0.12149894,
       -0.16043539, -0.19315961,  0.10559075,  0.02609053,  0.41204444,
       -0.40747839, -0.14203864, -0.22034764, -0.18120967,  0.17194615,
        0.04295829,  0.02291438,  0.20852986,  0.23511888,  0.21593477,
        0.05037568, -0.02951043,  0.02491214, -0.01039343,  0.15004307,
       -0.07241164,  0.2227497 ,  0.21617855, -0.08113196,  0.06834389,
       -0.00569904, -0.2698507 ,  0.14832841,  0.29609945,  0.15997805,
        0.01945854,  0.27701685, -0.20912355, -0.05700139,  0.03779659,
       -0.32126781, -0.22996877,  0.02166177,  0.00516871, -0.0657784 ,
       -0.09869225,  0.11653145, -0.03409592, -0.10866409, -0.13455081,
       -0.12297951, -0.12116989, -0.03195545, -0.23835574, -0.0141314 ], dtype=float32)

In [ ]: