In [2]:
from cltk.corpus.utils.formatter import assemble_phi5_author_filepaths
from cltk.corpus.utils.formatter import assemble_tlg_author_filepaths
from cltk.corpus.utils.formatter import phi5_plaintext_cleanup
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer
from cltk.stop.greek.stops import STOPS_LIST as greek_stops
from cltk.stop.latin.stops import STOPS_LIST as latin_stops
from nltk.tokenize.punkt import PunktLanguageVars
from gensim.models import Doc2Vec
import logging
import os
import time

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
def gen_docs_bow(corpus, lemmatize, rm_stops, testing):
    """This returns a list of tokenized words for an entire document; no sentence tokenization."""
    punkt = PunktLanguageVars()
    if corpus == 'phi5':
        language = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        jv_replacer = JVReplacer()
        text_cleaner = phi5_plaintext_cleanup
        if rm_stops:
            stops = latin_stops
        else:
            stops = None
    elif corpus == 'tlg':
        language = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        text_cleaner = tlg_plaintext_cleanup
        if rm_stops:
            stops = greek_stops
        else:
            stops = None
    if lemmatize:
        lemmatizer = LemmaReplacer(language)
    if testing:
        filepaths = filepaths[:20]

    for filepath in filepaths:
        with open(filepath) as f:
            text = f.read()

        text = text_cleaner(text, rm_punctuation=True, rm_periods=True)
        words = punkt.word_tokenize(text)
        words = [w.lower() for w in words]
        words = [w for w in words if w]

        if stops:
            words = [w for w in words if w not in stops]
        words = [w for w in words if len(w) > 1]  # rm short words

        if language == 'latin':
            words = [jv_replacer.replace(word) for word in words]
        if lemmatize:
            words = lemmatizer.lemmatize(words)
        
        # dirty hack to change the lemmatizer's incorrect 'edo1' lemmas to 'sum1'
        new_words = ['sum1' if word == 'edo1' else word for word in words]

        yield new_words
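
A quick sanity check of the generator (a hypothetical snippet, not from the original run; the exact tokens depend on the local PHI5 install):

In [ ]:
docs = gen_docs_bow('phi5', lemmatize=True, rm_stops=True, testing=True)
first_doc = next(docs)  # one list of tokens per author file
print(len(first_doc), first_doc[:10])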

In [10]:
model = Doc2Vec(size=100, window=8, min_count=5, workers=4)

docs = gen_docs_bow('phi5', lemmatize=False, rm_stops=False, testing=True)

vocab_counter = 0
alert_per_processed = 100
for doc in docs:
    print(doc)  # temporary debugging: inspect each doc's tokens
    input()  # pause between docs
    vocab_counter += 1
    model.build_vocab(doc)  # fails: build_vocab() expects LabeledSentence objects, not a list of strings (see traceback below)
    if vocab_counter % alert_per_processed == 0:
        print('Building vocab:', vocab_counter)

docs = gen_docs_bow('phi5', lemmatize=False, rm_stops=False, testing=True)
train_counter = 0
for doc in docs:
    train_counter += 1
    try:
        model.train(doc)  # train() likewise expects LabeledSentence objects (see fix below)
    except Exception as e:
        print(e)
    if train_counter % alert_per_processed == 0:
        print('Training model:', train_counter)

model.init_sims(replace=True)
save_path = None  # set to e.g. '~/phi5_doc2vec.model' to persist the trained model
if save_path:
    save_path = os.path.expanduser(save_path)
    model.save(save_path)


['nam', 'ut', 'aliis', 'plerumque', 'obuenienti', 'magistratu', 'ob', 'metum', 'statuae', 'polliceantur', 'nunc', 'quod', 'ad', 'illum', 'attinet', 'quirites', 'quoniam', 'se', 'ampliorem', 'putat', 'esse', 'si', 'se', 'mihi', 'inimicum', 'dictitarit', 'quem', 'ego', 'mihi', 'neque', 'amicum', 'recipio', 'neque', 'inimicum', 'respicio', 'in', 'eum', 'ego', 'non', 'sum', 'plura', 'dicturus', 'nam', 'cum', 'indignissimum', 'arbitror', 'cui', 'uiris', 'bonis', 'benedicatur', 'tum', 'ne', 'idoneum', 'quidem', 'cui', 'probis', 'maledicatur', 'nam', 'si', 'in', 'eo', 'tempore', 'huiusmodi', 'homunculum', 'nomines', 'in', 'quo', 'punire', 'non', 'possis', 'maiore', 'honore', 'quam', 'contumelia', 'adficias', 'qua', 'in', 're', 'quanto', 'uniuersi', 'me', 'unum', 'antistatis', 'tanto', 'uobis', 'quam', 'mihi', 'maiorem', 'iniuriam', 'atque', 'contumeliam', 'facit', 'quirites', 'et', 'quanto', 'probi', 'iniuriam', 'facilius', 'accipiunt', 'quam', 'alteri', 'tradunt', 'tanto', 'ille', 'uobis', 'quam', 'mihi', 'peiorem', 'honorem', 'habuit', 'nam', 'me', 'iniuriam', 'ferre', 'uos', 'facere', 'uult', 'quirites', 'ut', 'hic', 'conquestio', 'istic', 'uituperatio', 'relinquatur', 'cum', 'sese', 'sciret', 'in', 'tantum', 'crimen', 'uenisse', 'atque', 'socios', 'ad', 'senatum', 'questum', 'flentes', 'uenisse', 'sese', 'pecunias', 'maximas', 'exactos', 'esse']

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-10-c49de72768dc> in <module>()
     10     input()
     11     vocab_counter += 1
---> 12     model.build_vocab(doc)
     13     if vocab_counter % alert_per_processed == 0:
     14         print('Building vocab:', vocab_counter)

/Users/kyle/cltk/venv/lib/python3.4/site-packages/gensim/models/word2vec.py in build_vocab(self, sentences)
    397         """
    398         logger.info("collecting all words and their counts")
--> 399         vocab = self._vocab_from(sentences)
    400         # assign a unique index to each word
    401         self.vocab, self.index2word = {}, []

/Users/kyle/cltk/venv/lib/python3.4/site-packages/gensim/models/doc2vec.py in _vocab_from(sentences)
    198                 logger.info("PROGRESS: at item #%i, processed %i words and %i word types" %
    199                             (sentence_no, total_words, len(vocab)))
--> 200             sentence_length = len(sentence.words)
    201             for label in sentence.labels:
    202                 total_words += 1

AttributeError: 'str' object has no attribute 'words'
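
The traceback pinpoints the bug: doc2vec's _vocab_from() reads sentence.words and sentence.labels, so build_vocab() expects an iterable of LabeledSentence objects, not a bare list of word strings. The word2vec.py frame also shows that every build_vocab() call resets self.vocab, so the vocabulary should be built once over the whole corpus, not per document. A minimal sketch of the fix, assuming the gensim version in the traceback (newer releases rename LabeledSentence to TaggedDocument and labels to tags); gen_labeled_docs is a hypothetical helper:

In [ ]:
from gensim.models.doc2vec import LabeledSentence

def gen_labeled_docs(corpus, lemmatize, rm_stops, testing):
    """Wrap each document's word list in a LabeledSentence with a unique label."""
    for doc_id, words in enumerate(gen_docs_bow(corpus, lemmatize, rm_stops, testing)):
        yield LabeledSentence(words=words, labels=['DOC_%s' % doc_id])

# materialize once: both build_vocab() and train() need to iterate the documents
documents = list(gen_labeled_docs('phi5', lemmatize=False, rm_stops=False, testing=True))

model = Doc2Vec(size=100, window=8, min_count=5, workers=4)
model.build_vocab(documents)  # once, over the whole corpus
model.train(documents)
model.init_sims(replace=True)

The 'DOC_%s' labels are arbitrary placeholders; any scheme that gives each document a unique tag (e.g. the PHI5 author filename) would work.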

In [ ]: