In [1]:
from cltk.corpus.utils.formatter import assemble_phi5_author_filepaths
from cltk.corpus.utils.formatter import assemble_tlg_author_filepaths
from cltk.corpus.utils.formatter import phi5_plaintext_cleanup
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer
from cltk.stop.greek.stops import STOPS_LIST as greek_stops
from cltk.stop.latin.stops import STOPS_LIST as latin_stops
from nltk.tokenize.punkt import PunktLanguageVars
from cltk.tokenize.sentence import TokenizeSentence
from gensim.models import Word2Vec
import os
In [2]:
# Load a previously saved Word2Vec model from the user's cltk_data directory.
# Judging by the filename this is presumably a model trained on lemmatized
# PHI (Latin) texts -- TODO confirm against the notebook that trained it.
# NOTE(review): gensim's load() unpickles the file, so only load model files
# from a trusted source.
model_path = os.path.expanduser('~/cltk_data/user_data/word2vec_phi_lemma.model')
model = Word2Vec.load(model_path)
In [3]:
# Vocabulary of the loaded model: a mapping keyed by word, so iterating over
# it yields every word the model knows.
#
# The attribute moved between gensim releases:
#   gensim < 1.0    -> model.vocab
#   gensim 1.0-3.x  -> model.wv.vocab
#   gensim >= 4.0   -> model.wv.key_to_index
# Resolve whichever one exists so the notebook survives a fresh kernel on any
# of them. Only the keys are relied on downstream; the values differ by version.
_keyed_vectors = getattr(model, 'wv', model)
vocab = getattr(_keyed_vectors, 'key_to_index', None)
if vocab is None:
    vocab = _keyed_vectors.vocab
In [10]:
# Dump, for every vocabulary word, the list of its nearest neighbours whose
# similarity score is strictly above SIMILARITY_THRESHOLD. Output format is
# one line per word: "<word>\t<python list repr of neighbours>\n". Words with
# no neighbour above the threshold are skipped.
SIMILARITY_THRESHOLD = 0.50

path = os.path.expanduser('~/cltk_data/user_data/latin_word2vec_most_sims.txt')

# gensim >= 1.0 exposes most_similar() on model.wv (and gensim 4 removed it
# from the model object itself); older releases only have it on the model.
most_similar = getattr(model, 'wv', model).most_similar

# Open the file once in 'w' mode: this truncates any previous output (so the
# cell is idempotent on re-run) and avoids reopening the file for every word.
# The encoding is explicit so the output does not depend on the platform locale.
with open(path, 'w', encoding='utf-8') as fo:
    for x in vocab:
        # most_similar() yields (neighbour, score) pairs.
        syn_list = [k for k, v in most_similar(x) if v > SIMILARITY_THRESHOLD]
        if syn_list:
            fo.write(x + '\t' + str(syn_list) + '\n')
In [ ]: