In [1]:
from cltk.corpus.utils.formatter import assemble_phi5_author_filepaths
from cltk.corpus.utils.formatter import phi5_plaintext_cleanup
from cltk.stop.latin.stops import STOPS_LIST
from nltk.tokenize.punkt import PunktLanguageVars
from cltk.tokenize.sentence import TokenizeSentence
from gensim.models import Word2Vec
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %message)s', level=logging.INFO)
In [2]:
# Collect the on-disk paths of every PHI5 author file (requires the PHI5
# corpus to already be imported into ~/cltk_data).
filepaths = assemble_phi5_author_filepaths()
# CLTK's Latin-trained Punkt sentence tokenizer.
sent_tokenizer = TokenizeSentence('latin')
# NLTK Punkt word tokenizer; used below for word-level tokenization.
p = PunktLanguageVars()
In [89]:
phi_sentences = []

# Map accented vowels to their plain forms. Fix: the original used one
# `elif` per vowel, so a word containing two DIFFERENT accented vowels had
# only the first kind replaced; `str.translate` replaces all of them.
ACCENT_TABLE = str.maketrans('áéíóú', 'aeiou')

# Tokens that are pure punctuation and are dropped outright.
PUNCT_TOKENS = {',', '.', ';', ':', '"', "'", '?', '-', '!', '*', '[', ']', '{', '}'}

def clean_word_token(word):
    """Normalize one word token; return the cleaned word, or None to drop it.

    Steps: strip attached periods/curly quotes, de-accent vowels, then
    discard punctuation-only tokens, Latin stop words, and metrical marks.
    """
    # Strip attached sentence-final periods and curly quotes. Fixes two
    # defects in the original: only ONE leading/trailing char was removed,
    # and words that had a char stripped bypassed the stop-word/meter
    # filters entirely; here the stripped word still goes through them.
    word = word.strip('.“”')
    word = word.translate(ACCENT_TABLE)
    if not word or word in PUNCT_TOKENS:
        return None
    if word in STOPS_LIST:  # remove stop words
        return None
    if '˘' in word:  # remove metrical (scansion) notation
        return None
    return word

for filepath in filepaths:
    with open(filepath) as f:
        text_raw = f.read()
    text_clean = phi5_plaintext_cleanup(text_raw)  # strip PHI5 markup
    sent_tokens_upper = sent_tokenizer.tokenize_sentences(text_clean)  # sentence tokenize
    sent_tokens = [s.lower() for s in sent_tokens_upper]  # lowercase
    for sent in sent_tokens:  # tokenize words within each sentence
        cleaned = (clean_word_token(w) for w in p.word_tokenize(sent))
        # Drop Nones/empties created by the cleanup, and one-character words.
        sentence = [w for w in cleaned if w and len(w) > 1]
        if sentence:  # skip sentences emptied by the cleanup
            phi_sentences.append(sentence)
In [92]:
# Peek at the first few cleaned sentences and report the corpus size.
sample = phi_sentences[:5]
print(sample)
print(f'Total sentences: {len(phi_sentences)}')
In [124]:
# Train a 100-dimensional skip-gram/CBOW model on the cleaned sentences;
# words occurring fewer than 5 times are ignored (min_count=5).
# NOTE(review): the `size=` keyword is pre-4.0 gensim API; in gensim >= 4.0
# it was renamed `vector_size` — confirm the installed gensim version.
model = Word2Vec(sentences=phi_sentences, size=100, window=5, min_count=5, workers=4)
# If you’re finished training a model (=no more updates, only querying), you can do
# https://radimrehurek.com/gensim/models/word2vec.html
# model.init_sims(replace=True)
In [3]:
import os

# Location where the trained model is persisted, under the user's
# cltk_data directory.
model_path = os.path.expanduser(os.path.join('~', 'cltk_data', 'user_data', 'word2vec_phi.model'))
In [105]:
model.save(model_path) # persist trained model to disk (~84 MB)
In [4]:
# Reload the persisted model so the query cells below can run without
# retraining (note: `Word2Vec` and `model_path` must be defined first).
model = Word2Vec.load(model_path)
In [107]:
model.most_similar('memor')  # nearest neighbours of 'memor' (mindful) by cosine similarity
Out[107]:
In [108]:
model.most_similar('amat')  # nearest neighbours of 'amat' (he/she loves)
Out[108]:
In [109]:
model.most_similar('aeneas')  # nearest neighbours of the name Aeneas
Out[109]:
In [110]:
model.most_similar('cura')  # nearest neighbours of 'cura' (care/concern)
Out[110]:
In [5]:
# "puer" is to "pater" as "filia" is to ...?
# Vector analogy: filia + pater - puer; top 3 candidates.
model.most_similar(['filia', 'pater'], ['puer'], topn=3) # 'should' be mater!
Out[5]:
In [113]:
# which word doesn't go with the others?
model.doesnt_match("filius pater mater canis".split())  # odd-one-out by distance from the mean vector
Out[113]:
In [119]:
model.similarity('pater', 'mater')  # cosine similarity: father vs. mother
Out[119]:
In [121]:
model.similarity('pater', 'canis')  # cosine similarity: father vs. dog (expected lower)
Out[121]:
In [123]:
model['hasta']  # raw 100-dim embedding vector for 'hasta' (spear)
Out[123]:
Out[123]:
In [ ]: