In [1]:
from cltk.corpus.utils.formatter import assemble_tlg_works_filepaths
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
from cltk.stop.greek.stops import STOPS_LIST
from nltk.tokenize.punkt import PunktLanguageVars
from cltk.tokenize.sentence import TokenizeSentence
from gensim.models import Word2Vec
In [2]:
filepaths = assemble_tlg_works_filepaths()
sent_tokenizer = TokenizeSentence('greek')
p = PunktLanguageVars()
In [22]:
tlg_sentences = []
count_file = 0
for filepath in filepaths:
    with open(filepath) as f:
        text_raw = f.read()
    text_clean = tlg_plaintext_cleanup(text_raw)  # clean up raw TLG plaintext
    sent_tokens_upper = sent_tokenizer.tokenize_sentences(text_clean)  # sentence tokenize
    sent_tokens = [s.lower() for s in sent_tokens_upper]  # lowercase
    for sent in sent_tokens:  # tokenize words in sentences
        sent_word_tokens = p.word_tokenize(sent)
        sent_word_tokens_new = []
        for word in sent_word_tokens:  # strip trailing punctuation (final period, quotation mark)
            if word[-1] in ['.', '“']:
                sent_word_tokens_new.append(word[:-1])
            else:
                sent_word_tokens_new.append(word)
        sent_word_tokens_new = [w for w in sent_word_tokens_new if len(w) > 1]  # rm 1-char words
        sentence = [w for w in sent_word_tokens_new if w]  # rm empty tokens created by the above cleanup
        if sentence:  # skip sentences emptied by the above cleanup
            tlg_sentences.append(sentence)
    count_file += 1
    if count_file % 500 == 0:
        print(count_file, '/', len(filepaths))
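Word2Vec wants its corpus as an iterable of tokenized sentences (a list of lists of strings), which is exactly the shape built above. Since training iterates over the corpus more than once, a plain generator won't do, but a restartable iterable would avoid holding all ~1 GB of sentences in memory at once. A minimal sketch, using a hypothetical TLGSentences class and skipping the punctuation cleanup above:

class TLGSentences:
    """Restartable iterable: yields one tokenized, lowercased sentence at a time."""
    def __init__(self, filepaths):
        self.filepaths = filepaths
    def __iter__(self):
        for filepath in self.filepaths:
            with open(filepath) as f:
                text_clean = tlg_plaintext_cleanup(f.read())
            for sent in sent_tokenizer.tokenize_sentences(text_clean):
                yield p.word_tokenize(sent.lower())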
In [23]:
import os
with open(os.path.expanduser('~/cltk_data/user_data/tlg_sentences.py'), 'w') as f:  # resulting file is ~1.0 GB
    f.write(str(tlg_sentences))  # writes the list's repr to disk
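To get the sentences back later without re-running the preprocessing, one option is ast.literal_eval (a sketch; it parses the whole file at once, so the ~1.0 GB repr must fit in memory):

import ast
import os

with open(os.path.expanduser('~/cltk_data/user_data/tlg_sentences.py')) as f:
    tlg_sentences = ast.literal_eval(f.read())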
In [24]:
print(tlg_sentences[:5])
print('Total sentences:', len(tlg_sentences))
In [25]:
model = Word2Vec(sentences=tlg_sentences, size=100, window=5, min_count=5, workers=4)  # NB: training took ~1.5 hrs
# Once you're finished training (no more updates, only querying), you can trim memory with:
# model.init_sims(replace=True)
# See https://radimrehurek.com/gensim/models/word2vec.html
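A quick sanity check after training (a sketch; in gensim releases of this vintage the vocabulary hangs off model.vocab, while later 1.x–3.x releases move it to model.wv.vocab):

print(len(model.vocab))      # number of word types that survived min_count=5
print(model['λόγος'].shape)  # each word maps to a 100-dim vector, per size=100 (assumes 'λόγος' is in the vocabulary)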
In [26]:
model_path = os.path.expanduser('~/cltk_data/user_data/word2vec_tlg.model')
In [27]:
model.save(model_path) # word2vec_tlg.model.syn0.npy: 155 MB; word2vec_tlg.model.syn1.npy: 155 MB; word2vec_tlg.model: 88MB
In [106]:
# to load:
model = Word2Vec.load(model_path)
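most_similar() raises a KeyError for any word that never occurred or fell below min_count, so it can be worth guarding queries (a sketch; on later gensim 1.x–3.x the vocabulary is model.wv.vocab instead of model.vocab):

word = 'ἄγγελος'
if word in model.vocab:
    print(model.most_similar(word, topn=5))  # returns (word, cosine similarity) pairs, most similar first
else:
    print(word, 'is not in the vocabulary')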
In [29]:
model.most_similar('ἔπεμπον')
Out[29]:
In [32]:
model.most_similar('ἄγγελος')
Out[32]:
In [38]:
model.most_similar('ἄχος')
Out[38]:
In [39]:
model.most_similar('κῆρ')
Out[39]:
In [40]:
model.most_similar('ἄνδρα')
Out[40]:
In [42]:
model.most_similar('παῖδα')
Out[42]:
In [43]:
model.most_similar('ἱερέα')
Out[43]:
In [52]:
model.most_similar('γυναικὸς')
Out[52]:
In [111]:
# "puer" is to "pater" as "filia" is to ...?
model.most_similar(['filia', 'pater'], ['puer'], topn=3) # 'should' be mater!
Out[111]:
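Under the hood this analogy query is vector arithmetic: most_similar() returns the vocabulary words whose vectors lie closest (by cosine) to vec('filia') + vec('pater') - vec('puer'). A rough manual equivalent (a sketch; gensim actually unit-normalizes each input vector first, and all four words must be in the vocabulary):

import numpy as np

target = model['filia'] + model['pater'] - model['puer']
candidate = model['mater']
print(np.dot(target, candidate) / (np.linalg.norm(target) * np.linalg.norm(candidate)))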
In [46]:
# which word doesn't go with the others? ('child', 'man', 'temple')
model.doesnt_match("παῖδα ἄνδρα ναὸν".split())
Out[46]:
In [51]:
#model.similarity('γινώσκω', 'ἔχω')
In [53]:
model.similarity('ἄνδρα', 'ἀνὴρ')  # two case forms of the same noun ('man'); strange result?
Out[53]:
In [54]:
model.similarity('ἄνδρα', 'ὄργανον')
Out[54]:
In [55]:
model.similarity('ἀνὴρ', 'ὄργανον')
Out[55]:
In [56]:
model.similarity('ὄνομα', 'ὄργανον')
Out[56]:
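similarity() is just the cosine between the two word vectors, so it can be reproduced by hand (a sketch using numpy; assumes both words are in the vocabulary):

import numpy as np

a, b = model['ὄνομα'], model['ὄργανον']
print(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))  # should match model.similarity('ὄνομα', 'ὄργανον')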