In [1]:
from cltk.corpus.utils.formatter import assemble_tlg_works_filepaths
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
from cltk.stop.greek.stops import STOPS_LIST
from nltk.tokenize.punkt import PunktLanguageVars
from cltk.tokenize.sentence import TokenizeSentence

from gensim.models import Word2Vec

Prepare TLG sentences


In [2]:
filepaths = assemble_tlg_works_filepaths()
sent_tokenizer = TokenizeSentence('greek')
p = PunktLanguageVars()

In [22]:
tlg_sentences = []
count_file = 0
for filepath in filepaths:
    with open(filepath) as f:
        text_raw = f.read()
    text_clean = tlg_plaintext_cleanup(text_raw)  # strip TLG markup
    sent_tokens_upper = sent_tokenizer.tokenize_sentences(text_clean)  # sentence tokenize
    sent_tokens = [s.lower() for s in sent_tokens_upper]  # lowercase
    for sent in sent_tokens:  # tokenize words in sentences
        sent_word_tokens = p.word_tokenize(sent)
        sent_word_tokens_new = []
        for word in sent_word_tokens:  # strip trailing punctuation (final period or quotation mark)
            if word[-1] in ['.', '“']:
                word_new = word[:-1]
                sent_word_tokens_new.append(word_new)
            else:
                sent_word_tokens_new.append(word)
        sentence = [w for w in sent_word_tokens_new if len(w) > 1]  # drop one-character and empty tokens
        if sentence:  # skip sentences emptied by the cleanup above
            tlg_sentences.append(sentence)
    count_file += 1
    if count_file % 100 == 0:
        print(count_file, '/', len(filepaths))


100 / 6625
200 / 6625
300 / 6625
...
6400 / 6625
6500 / 6625
6600 / 6625
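
Note that this builds the whole corpus in memory as one Python list. Since gensim's Word2Vec accepts any restartable iterable of sentences, the loop above could instead stream sentences one file at a time. A minimal sketch, assuming the same filepaths, sent_tokenizer, p, and tlg_plaintext_cleanup as above (the TLGSentences class is a hypothetical helper, not part of CLTK):

class TLGSentences:
    """Restartable iterable yielding one tokenized sentence at a time."""
    def __init__(self, filepaths):
        self.filepaths = filepaths

    def __iter__(self):
        for filepath in self.filepaths:
            with open(filepath) as f:
                text_clean = tlg_plaintext_cleanup(f.read())
            for sent in sent_tokenizer.tokenize_sentences(text_clean):
                tokens = [w.rstrip('.“') for w in p.word_tokenize(sent.lower())]
                tokens = [w for w in tokens if len(w) > 1]
                if tokens:
                    yield tokens

# Word2Vec iterates over the corpus more than once (vocab scan, then training),
# which is why this must be a class with __iter__ rather than a one-shot generator:
# model = Word2Vec(TLGSentences(filepaths), size=100, window=5, min_count=5, workers=4)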

In [23]:
import os
with open(os.path.expanduser('~/cltk_data/user_data/tlg_sentences.py'), 'w') as f:
    f.write(str(tlg_sentences))  # writes the list's repr, ~1.0 GB on disk
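
Writing the list with str() means reading it back requires ast.literal_eval() over a ~1 GB string. A more robust round-trip, sketched with pickle (tlg_sentences.pickle is just an illustrative filename):

import pickle

pickle_path = os.path.expanduser('~/cltk_data/user_data/tlg_sentences.pickle')
with open(pickle_path, 'wb') as f:
    pickle.dump(tlg_sentences, f)

# to read back:
# with open(pickle_path, 'rb') as f:
#     tlg_sentences = pickle.load(f)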

In [24]:
print(tlg_sentences[:5])
print('Total sentences:', len(tlg_sentences))


[['περὶ', 'δὴ', 'τῶν', 'ἑπτὰ', 'ἄξιον', 'γὰρ', 'ἐνταῦθα', 'καθολικῶς', 'κἀκείνων', 'ἐπιμνησθῆναι', 'λόγοι', 'φέρονται', 'τοιοῦτοι'], ['δάμων', 'κυρηναῖος', 'γεγραφὼς', 'περὶ', 'τῶν', 'φιλοσόφων', 'πᾶσιν', 'ἐγκαλεῖ', 'μάλιστα', 'δὲ', 'τοῖς', 'ἑπτά'], ['ἀναξιμένης', 'δέ', 'φησι', 'πάντας', 'ἐπιθέσθαι', 'ποιητικῆι', 'δὲ', 'δικαίαρχος'], ['οὔτε', 'σοφοὺς', 'οὔτε', 'φιλοσόφους', 'φησὶν', 'αὐτοὺς', 'γεγονέναι', 'συνετοὺς', 'δέ', 'τινας', 'καὶ', 'νομοθετικούς'], ['ἀρχέτιμος', 'δὲ', 'συρακούσιος', 'ὁμιλίαν', 'αὐτῶν', 'ἀναγέγραφε', 'παρὰ', 'κυψέλωι', 'ἧι', 'καὶ', 'αὐτός', 'φησι', 'παρατυχεῖν', 'ἔφορος']]
Total sentences: 3320169

Train model


In [25]:
model = Word2Vec(sentences=tlg_sentences, size=100, window=5, min_count=5, workers=4)  # training took ~1.5 hrs
# Once training is finished (no more updates, only querying), memory can be trimmed with:
# model.init_sims(replace=True)
# https://radimrehurek.com/gensim/models/word2vec.html

In [26]:
model_path = os.path.expanduser('~/cltk_data/user_data/word2vec_tlg.model')

In [27]:
model.save(model_path)  # word2vec_tlg.model.syn0.npy: 155 MB; word2vec_tlg.model.syn1.npy: 155 MB; word2vec_tlg.model: 88MB

In [106]:
# to load:
model = Word2Vec.load(model_path)
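
Note that most_similar() raises a KeyError for any word pruned by min_count=5, so it can pay to check vocabulary membership first. A small sketch; in the gensim release used here the vocabulary dict is model.vocab, while later releases move it to model.wv.vocab:

word = 'λόγος'  # any query word
vocab = getattr(model, 'wv', model).vocab  # pre-4.0 gensim; 4.x uses model.wv.key_to_index
if word in vocab:
    print(model.most_similar(word, topn=3))
else:
    print(word, 'was pruned by min_count')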

Fun with word2vec


In [29]:
model.most_similar('ἔπεμπον')


Out[29]:
[('ἔπεμψαν', 0.8375612497329712),
 ('πέμψαντες', 0.821482241153717),
 ('πέμποντες', 0.7889137268066406),
 ('ἀπέστελλον', 0.7763285636901855),
 ('πρέσβεις', 0.7738817930221558),
 ('πέμπουσιν', 0.7730725407600403),
 ('ἐξέπεμπον', 0.7639554738998413),
 ('ἐξέπεμψαν', 0.7628260850906372),
 ('πέμπουσι', 0.7626725435256958),
 ('ἐξαπέστελλον', 0.7615334987640381)]

In [32]:
model.most_similar('ἄγγελος')


Out[32]:
[('μανωε', 0.696036159992218),
 ('ἀρχάγγελος', 0.6499038338661194),
 ('ζεβουλ', 0.6473389863967896),
 ('παρεμβαλεῖ', 0.6429287791252136),
 ('ἐπιφανήσεται', 0.641310453414917),
 ('γαβριὴλ', 0.6345863342285156),
 ('σαμαιαν', 0.6238977909088135),
 ('βαλααμ', 0.6236370801925659),
 ('ησαιας', 0.6227160692214966),
 ('σαμουηλ', 0.6202077865600586)]

In [38]:
model.most_similar('ἄχος')


Out[38]:
[('μένος', 0.770604133605957),
 ('ἄμπεχε', 0.7698976993560791),
 ('αἰνὸν', 0.7675420045852661),
 ('ἐλθέμεναι', 0.7659810185432434),
 ('δῦν', 0.7652841806411743),
 ('κῆρ', 0.7560437321662903),
 ('ὦκα', 0.7523142099380493),
 ('ἄμοτον', 0.7515565156936646),
 ('γόου', 0.7511271238327026),
 ('ὄρωρεν', 0.7479422092437744)]

In [39]:
model.most_similar('κῆρ')


Out[39]:
[('ποδώκεος', 0.8257793188095093),
 ('ὁμοίιον', 0.8225136995315552),
 ('ἀχιλῆϊ', 0.8209117650985718),
 ('χάρμης', 0.8179543614387512),
 ('δηριαδῆος', 0.8171504139900208),
 ('ὦκα', 0.8166021108627319),
 ('αἰακίδαο', 0.8149865865707397),
 ('τετλάτω', 0.8137754201889038),
 ('θοῦρον', 0.8128254413604736),
 ('ἀχνύμενοι', 0.8121742010116577)]

In [40]:
model.most_similar('ἄνδρα')


Out[40]:
[('γέροντα', 0.7098464965820312),
 ('παῖδα', 0.7086045742034912),
 ('ἀδελφὸν', 0.6956572532653809),
 ('οἰκέτην', 0.6936179399490356),
 ('νεανίαν', 0.6913819313049316),
 ('πρεσβύτην', 0.6908776760101318),
 ('νεανίσκον', 0.6900901794433594),
 ('ἐραστὴν', 0.6859878301620483),
 ('ἱερέα', 0.6836997270584106),
 ('νυμφίον', 0.679427981376648)]

In [42]:
model.most_similar('παῖδα')


Out[42]:
[('φονέα', 0.7779688239097595),
 ('οἰνοχόον', 0.7524860501289368),
 ('ἱππόλυτον', 0.7507445216178894),
 ('ἀδελφὸν', 0.7471684217453003),
 ('υἱόν', 0.7427107691764832),
 ('ἐρώμενον', 0.7391958236694336),
 ('χείρωνα', 0.7375092506408691),
 ('πᾶνα', 0.7372569441795349),
 ('τάφον', 0.7323480844497681),
 ('ἄδωνιν', 0.7319536805152893)]

In [43]:
model.most_similar('ἱερέα')


Out[43]:
[('προφήτην', 0.7774537205696106),
 ('οὐρίαν', 0.7593482136726379),
 ('μιχαίαν', 0.7458844184875488),
 ('ἐλισσαῖον', 0.7311204671859741),
 ('βαπτιστὴν', 0.7267971038818359),
 ('μανασσὴν', 0.7137241363525391),
 ('σολομῶντα', 0.7134689092636108),
 ('ἀρχιερέα', 0.7127511501312256),
 ('ναὸν', 0.7113758325576782),
 ('νομοθέτην', 0.710371196269989)]

In [52]:
model.most_similar('γυναικὸς')


Out[52]:
[('γυναικός', 0.862053632736206),
 ('μητρὸς', 0.7998234033584595),
 ('δούλης', 0.7781879901885986),
 ('ἀδελφῆς', 0.701331377029419),
 ('μοιχευομένης', 0.6862832903862),
 ('θυγατρὸς', 0.6827195882797241),
 ('νύμφης', 0.662971019744873),
 ('πόρνης', 0.6611018180847168),
 ('σάρρας', 0.6597551107406616),
 ('γαμετῆς', 0.6574646234512329)]

In [111]:
# "puer" is to "pater" as "filia" is to ...?
model.most_similar(positive=['filia', 'pater'], negative=['puer'], topn=3)  # 'should' be mater!


Out[111]:
[('nepos', 0.6462836265563965),
 ('pronepos', 0.6400849223136902),
 ('patrui', 0.634131908416748)]
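
Under the hood, most_similar(positive, negative) is vector arithmetic: gensim L2-normalizes each input vector, adds the positives, subtracts the negatives, and ranks every other word by cosine similarity to the (re-normalized) result. A sketch recomputing the top score by hand, which should reproduce the 'nepos' figure above:

import numpy as np

def unit(v):
    return v / np.linalg.norm(v)

# "x" such that x ≈ pater - puer + filia
target = unit(unit(model['pater']) - unit(model['puer']) + unit(model['filia']))
print(np.dot(target, unit(model['nepos'])))  # ~0.646, as in Out[111]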

In [46]:
# which word doesn't go with the others?
model.doesnt_match("παῖδα ἄνδρα ναὸν".split())


Out[46]:
'ναὸν'
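
doesnt_match() works on the same principle: it averages the normalized vectors of all the words and returns the one whose vector has the lowest cosine similarity to that mean. A sketch, reusing unit() and numpy from the previous cell:

words = 'παῖδα ἄνδρα ναὸν'.split()
vecs = np.vstack([unit(model[w]) for w in words])
mean = unit(vecs.mean(axis=0))
print(words[int(np.dot(vecs, mean).argmin())])  # ναὸν is farthest from the group mean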

In [51]:
# model.similarity('γινώσκω', 'ἔχω')

In [53]:
model.similarity('ἄνδρα', 'ἀνὴρ')  # surprisingly low: the corpus was not lemmatized, so ἄνδρα (acc.) and ἀνὴρ (nom.) are separate tokens


Out[53]:
0.26308083896554618

In [54]:
model.similarity('ἄνδρα', 'ὄργανον')


Out[54]:
0.026751346469487833

In [55]:
model.similarity('ἀνὴρ', 'ὄργανον')


Out[55]:
-0.086610455019305616

In [56]:
model.similarity('ὄνομα', 'ὄργανον')


Out[56]:
0.30930986578023612
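
similarity() is simply the cosine between the two raw word vectors, so scores run from -1 through 0 (unrelated, as with ἀνὴρ and ὄργανον above) to 1. A sketch that should reproduce Out[56]:

import numpy as np

def cosine(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

print(cosine(model['ὄνομα'], model['ὄργανον']))  # ~0.309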
