The following are tests of Word2Vec with Latin and Greek. For each language, I'm testing the results obtained with various parameters (stopword removal, lemmatization, vector size, etc.).
In [1]:
from cltk.corpus.utils.formatter import assemble_phi5_author_filepaths
from cltk.corpus.utils.formatter import assemble_tlg_author_filepaths
from cltk.corpus.utils.formatter import phi5_plaintext_cleanup
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer
from cltk.stop.greek.stops import STOPS_LIST as greek_stops
from cltk.stop.latin.stops import STOPS_LIST as latin_stops
from nltk.tokenize.punkt import PunktLanguageVars
from cltk.tokenize.sentence import TokenizeSentence
from gensim.models import Word2Vec
import logging
import os
import time
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
In [2]:
def gen_docs(corpus, lemmatize, rm_stops, testing):
    # TODO: Replace accented chars with unaccented equivalents
    punkt = PunktLanguageVars()
    if corpus == 'phi5':
        language = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        jv_replacer = JVReplacer()
        text_cleaner = phi5_plaintext_cleanup
        if rm_stops:
            stops = latin_stops
        else:
            stops = None
    elif corpus == 'tlg':
        language = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        text_cleaner = tlg_plaintext_cleanup
        if rm_stops:
            stops = greek_stops
        else:
            stops = None
    if lemmatize:
        lemmatizer = LemmaReplacer(language)
    if testing:
        filepaths = filepaths[:20]
    sent_tokenizer = TokenizeSentence(language)
    for filepath in filepaths:
        with open(filepath) as f:
            text = f.read()
        # light first-pass cleanup, before sentence tokenization (which relies on punctuation)
        text = text_cleaner(text, rm_punctuation=False, rm_periods=False)
        sent_tokens = sent_tokenizer.tokenize_sentences(text)
        doc_sentences = []
        for sentence in sent_tokens:
            # a second cleanup at sentence level, to remove all punctuation
            sentence = text_cleaner(sentence, rm_punctuation=True, rm_periods=True)
            sentence = punkt.word_tokenize(sentence)
            sentence = [s.lower() for s in sentence]
            sentence = [w for w in sentence if w]
            if stops:
                sentence = [w for w in sentence if w not in stops]
            #sentence = [w for w in sentence if len(w) > 1]  # rm short words
            if sentence and lemmatize:
                sentence = lemmatizer.lemmatize(sentence)
            if sentence and language == 'latin':
                sentence = [jv_replacer.replace(word) for word in sentence]
            if sentence:
                doc_sentences.append(sentence)
        if doc_sentences:
            yield doc_sentences
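As a hypothetical sanity check (not part of the original run), note that each item yielded by gen_docs is one document: a list of sentences, each sentence a list of lowercased tokens. Something like the following should confirm the shape, assuming the TLG corpus is installed locally.
In [ ]:
# Hypothetical sanity check: inspect the first document yielded by gen_docs.
docs = gen_docs('tlg', lemmatize=False, rm_stops=False, testing=True)
first_doc = next(docs)
print(len(first_doc))      # number of sentences in the first document
print(first_doc[0][:10])   # first ten tokens of its first sentence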
In [3]:
def make_model(corpus, lemmatize=False, rm_stops=False, size=100, window=10, min_count=5, workers=4, sg=1, testing=False, save_path=None):
    # load an empty model, build the vocab, then train
    # https://groups.google.com/forum/#!topic/gensim/xXKz-v8brAI
    model = Word2Vec(sentences=None, size=size, window=window, min_count=min_count, workers=workers, sg=sg)
    docs = gen_docs(corpus, lemmatize=lemmatize, rm_stops=rm_stops, testing=testing)
    vocab_counter = 0
    alert_per_processed = 100
    for sentences in docs:
        vocab_counter += 1
        model.build_vocab(sentences)
        if vocab_counter % alert_per_processed == 0:
            print('Building vocab:', vocab_counter)
    # the generator is now exhausted, so make a fresh one for the training pass
    docs = gen_docs(corpus, lemmatize=lemmatize, rm_stops=rm_stops, testing=testing)
    train_counter = 0
    for sentences in docs:
        train_counter += 1
        try:
            model.train(sentences)
        except Exception as e:
            print(e)
        if train_counter % alert_per_processed == 0:
            print('Training model:', train_counter)
    model.init_sims(replace=True)
    if save_path:
        save_path = os.path.expanduser(save_path)
        model.save(save_path)
    else:
        return model
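Once trained, a model can be queried directly; gensim's Word2Vec exposes similarity() and most_similar(). A minimal sketch follows (the Latin words are only illustrative and must actually be in the model's vocabulary):
In [ ]:
# Sketch only: build a small test model and query it.
model = make_model(corpus='phi5', testing=True)
print(model.similarity('amor', 'amicitia'))   # cosine similarity between two words
print(model.most_similar('amor', topn=5))     # five nearest neighbors in vector space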
In [4]:
def make_options():
    corpus = ['phi5', 'tlg']
    lemmatize = [True, False]
    rm_stops = [True, False]
    window = [5, 10, 20]
    size = [100, 200, 500, 1000]
    skip_gram = [True, False]
    for c in corpus:
        for l in lemmatize:
            for r in rm_stops:
                for w in window:
                    for s in size:
                        for sg in skip_gram:
                            yield (c, l, r, w, s, sg)
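This grid yields 2 * 2 * 2 * 3 * 4 * 2 = 192 parameter combinations. An equivalent formulation with itertools.product, shown here only as a cross-check, not used below:
In [ ]:
# Cross-check the size of the parameter grid with itertools.product.
from itertools import product
combos = list(product(['phi5', 'tlg'], [True, False], [True, False],
                      [5, 10, 20], [100, 200, 500, 1000], [True, False]))
print(len(combos))  # 192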
This is just a single test build to confirm that everything works correctly. It should run in under 30 seconds.
In [ ]:
'''
test_model = None
test_model = make_model(corpus='tlg', lemmatize=False, rm_stops=False, size=100, window=10, min_count=5, workers=4, sg=1, testing=True)
print(len(test_model.vocab))
#print(test_model.vocab)
'''
In [ ]:
options = make_options()
log_path = os.path.expanduser('~/word2vec_tests_20150726/make_models.log')
with open(log_path, 'w') as file_opened:
    file_opened.write('')
for option in options:
    corpus, lemmatize, rm_stops, window, size, skip_gram = option
    if corpus == 'phi5':
        lang = 'latin'
    elif corpus == 'tlg':
        lang = 'greek'
    if skip_gram:
        sg = 1
    else:
        sg = 0  # i.e., use CBOW
    model_name = '{0}_lemmatize{1}_rmstops{2}_window{3}_size{4}_sg{5}.model'.format(corpus, lemmatize, rm_stops, window, size, skip_gram)
    model_path = os.path.join('~/word2vec_tests_20150726/', lang, model_name)
    model_path = os.path.expanduser(model_path)
    start = time.time()
    try:
        make_model(corpus=corpus, lemmatize=lemmatize, rm_stops=rm_stops, size=size, window=window, min_count=5, workers=4, sg=sg, testing=True, save_path=model_path)
        build_time = '\nBuild time: {}'.format(time.time() - start)
        log_text = model_path + '\n' + str(build_time) + '\n'
        with open(log_path, 'a') as file_opened:
            file_opened.write(log_text)
    except Exception as e:
        with open(log_path, 'a') as file_opened:
            file_opened.write('Build failed for: {0}.\n{1}'.format(model_path, e))
    try:
        #test_model = test_open(model_path)
        Word2Vec.load(model_path)
    except Exception as e:
        with open(log_path, 'a') as file_opened:
            file_opened.write('Loading failed: {}\n'.format(e))
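After the sweep finishes, any saved model can be reloaded for inspection. A hypothetical example (the filename below just mirrors the naming scheme used above):
In [ ]:
# Hypothetical: reload one saved model and check its vocabulary size.
saved_path = os.path.expanduser('~/word2vec_tests_20150726/latin/phi5_lemmatizeFalse_rmstopsFalse_window10_size100_sgTrue.model')
saved_model = Word2Vec.load(saved_path)
print(len(saved_model.vocab))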