In [1]:
import itertools
import logging
import os
import pickle
import sys
import time

from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithets
from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithet_of_author
from cltk.corpus.greek.tlg.parse_tlg_indices import get_id_author
import gensim
from gensim.corpora.mmcorpus import MmCorpus
from gensim.utils import simple_preprocess
import numpy as np

In [2]:
# put current dir in path, for importing local module
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
# import local module
from lda_helpers import mk_working_dir
from lda_helpers import working_dir
from lda_helpers import tokenize
from lda_helpers import iter_docs
from lda_helpers import PREPROCESS_DEACCENT
from lda_helpers import TOK_MIN
from lda_helpers import TOK_MAX
from lda_helpers import DOC_MIN
from lda_helpers import remove_ascii
from lda_helpers import STOPS_LIST
from lda_helpers import no_below
from lda_helpers import no_above
from lda_helpers import GenerateCorpus

In [4]:
# enable verbose print-to-screen logging for Gensim
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO  # ipython sometimes messes up the logging setup; restore

In [5]:
# where our results will go in ~/cltk_data/user_data
mk_working_dir(working_dir)

In [6]:
# Take a look at the docs post-processing
# Open corpus iterator
# docs_path_rel'~/cltk_data/greek/text/tlg/plaintext/'  # for tlg
docs_path_rel = '~/cltk_data/greek/text/greek_text_first1kgreek_plaintext/'
docs_preprocessed = os.path.expanduser(docs_path_rel)
stream = iter_docs(docs_preprocessed, rm_ascii=remove_ascii)
for title, tokens in itertools.islice(iter_docs(docs_preprocessed, rm_ascii=remove_ascii), 8):
    print(title, tokens[:10])  # print the article title and its first ten tokens


tlg4102.tlg006.opp-grc1.txt ['ερμηνεία', 'διαφόρων', 'εἰσ', 'κατά', 'λουκᾶν', 'εὐαγγέλιον', 'ὅτι', 'μὲν', 'ἄλλοι', 'εὐαγγελισταὶ']
tlg2200.tlg00518.opp-grc1.txt ['νόμος', 'τὸν', 'ξενίας', 'ἁλόντα', 'πιπράσκεσθαι', 'ἑάλω', 'ξενίας', 'δημοσθένης', 'ἔπεμψε', 'φίλιππος']
tlg0062.tlg050.1st1K-grc1.txt ['θεων', 'εκκλησια', 'ζευς', 'μηκέτι', 'τονθορύζετε', 'θεοί', 'μηδὲ', 'κατὰ', 'γωνίας', 'συστρεφόμενοι']
tlg2959.tlg011.opp-ger1.txt ['βραβεῖον', 'δυναστείαις', 'κατὰ', 'τὸν', 'ἰώβ', 'καὶ', 'κοπρία', 'παντὸς', 'θρόνου', 'βασιλικοῦ']
tlg2000.tlg001.opp-grc2.txt ['ζωιον', 'και', 'τις', 'ανθρωπος', 'ἡδοναὶ', 'καὶ', 'λῦπαι', 'φόβοι', 'καὶ', 'θάρρη']
tlg0057.tlg014.1st1K-grc1.txt ['γαληνου', 'περι', 'νευρων', 'ανατομης', 'βιβλιον', 'ὅτι', 'μὲν', 'οὐδὲν', 'τῶν', 'τοῦ']
tlg0086.tlg042.1st1K-grc1.txt ['περι', 'υπνου', 'και', 'εγρηγορσεως', 'περὶ', 'ὕπνου', 'καὶ', 'ἐγρηγόρσεως', 'σκεπτέον', 'τίνα']
tlg0057.tlg031.1st1K-grc1.txt ['γαληνου', 'περι', 'χρειας', 'σφυγμων', 'βιβλιον', 'τίς', 'χρεία', 'τῶν', 'σφυγμῶν', 'ἆρά']

Mk word dictionaries


In [7]:
# Open corpus iterator
doc_stream = (tokens for _, tokens in iter_docs(docs_preprocessed, rm_ascii=remove_ascii))

In [8]:
# store the dictionary, for future reference
dict_name = 'gensim_dict_id2word_1kgrk_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.dict'.format(no_below, 
                                                                                                            no_above, 
                                                                                                            TOK_MIN, 
                                                                                                            TOK_MAX, 
                                                                                                            DOC_MIN, 
                                                                                                            PREPROCESS_DEACCENT)
dict_path = os.path.join(working_dir, dict_name)

# consider doing same filtering as done in the class, then combinging count
try:
    id2word_map = gensim.corpora.dictionary.Dictionary.load(dict_path)
except FileNotFoundError:
    t0 = time.time()
    # ~4 min on TLG corpus if rm accents; ~w min if not
    id2word_map = gensim.corpora.Dictionary(doc_stream)
    # this cutoff might lose too much info, we'll see
    # ignore words that appear in less than 20 documents or more than 10% documents
    id2word_map.filter_extremes(no_below=no_below, no_above=no_above)
    id2word_map.save(dict_path)
    print('Time to mk new corpus dictionary:', time.time() - t0)
print(id2word_map)


INFO : loading Dictionary object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_dict_id2word_1kgrk_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.dict
INFO : loaded /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_dict_id2word_1kgrk_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.dict
Dictionary(22820 unique tokens: ['λουκᾶν', 'ἄρχονται', 'λουκᾶς', 'προοιμίοις', 'ζαχαρίου']...)

Mk vectors

Now start again with the corpus, turning the actual words into integers from our map.


In [9]:
# Illustrate what this BoW space looks like with example doc
doc = "περὶ ποιητικῆς αὐτῆς τε καὶ τῶν εἰδῶν αὐτῆς, ἥν τινα δύναμιν ἕκαστον ἔχει, καὶ πῶς δεῖ συνίστασθαι τοὺς μύθους [10] εἰ μέλλει καλῶς ἕξειν ἡ ποίησις, ἔτι δὲ ἐκ πόσων καὶ ποίων ἐστὶ μορίων, ὁμοίως δὲ καὶ περὶ τῶν ἄλλων ὅσα τῆς αὐτῆς ἐστι μεθόδου, λέγωμεν ἀρξάμενοι κατὰ φύσιν πρῶτον ἀπὸ τῶν πρώτων."
doc = ' '.join(simple_preprocess(doc))
bow = id2word_map.doc2bow(tokenize(doc, rm_ascii=remove_ascii))
print(bow)  # words both in BoW dict and doc
print(id2word_map[bow[0][0]])  # map int back to str


[(2469, 1), (5973, 1)]
ποίησις

In [10]:
clip_docs_at = 25 # None for final
# make the BoW corpus
# creates a stream of bag-of-words vectors
corpus_bow_tlg = GenerateCorpus(docs_preprocessed, id2word_map, clip_docs=clip_docs_at)

# reduce corpus size for faster testing
#corpus_bow_tlg = gensim.utils.ClippedCorpus(corpus_bow_tlg, 100)

# vector = next(iter(corpus_bow_tlg))
# print(vector)  # print the first vector in the stream
# [(0, 1), (1, 1), (2, 1), ...]

# # what is the most common word in that first article?
# most_index, most_count = max(vector, key=lambda _tuple: _tuple[1])
# print(id2word_map[most_index], most_count)  # μιλησιοις 2

In [11]:
# Save BoW
# ~4 min on TLG corpus
bow_name = 'gensim_bow_1kgrk_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.mm'.format(no_below, 
                                                                                                no_above, 
                                                                                                TOK_MIN, 
                                                                                                TOK_MAX, 
                                                                                                DOC_MIN, 
                                                                                                PREPROCESS_DEACCENT)
bow_path = os.path.join(working_dir, bow_name)
t0 = time.time()
gensim.corpora.MmCorpus.serialize(bow_path, corpus_bow_tlg)
print('Time to save BoW space:', time.time() - t0)

# Later load saved corpus with:
# corpus_bow_tlg = gensim.corpora.MmCorpus(bow_path)


INFO : storing corpus in Matrix Market format to /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_bow_1kgrk_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm
INFO : saving sparse matrix to /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_bow_1kgrk_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm
INFO : PROGRESS: saving document #0
INFO : saved 25x22128 matrix, density=4.745% (26247/553200)
INFO : saving MmCorpus index to /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_bow_1kgrk_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm.index
Time to save BoW space: 3.010697364807129

In [12]:
total_included_docs = len(corpus_bow_tlg.titles)  # used later for testing results

LDA transformation


In [13]:
# Quick testing using just a part of the corpus

NUM_TOPICS_LIST = [2, 3, 5, 10, 25, 50, 100]
NUM_TOPICS_LIST.append(len(get_epithets()))  # mk topics same number as traditional epithets
NUM_TOPICS_LIST = sorted(NUM_TOPICS_LIST)
PASSES = 1

In [15]:
for num_topics in NUM_TOPICS_LIST:
    print('Beginning training ...')
    print('... {} topics and {} passes ...'.format(num_topics, PASSES))
    t0 = time.time()
    lda_model = gensim.models.LdaMulticore(corpus_bow_tlg, num_topics=num_topics, id2word=id2word_map, passes=PASSES)
    
    # save LDA vector space
    lda_space_name = 'gensim_lda_space_1kgrk_numtopics{}_numpasses{}_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.mm'.format(num_topics, 
                                                                                                                                        PASSES, 
                                                                                                                                        no_below, 
                                                                                                                                        no_above, 
                                                                                                                                        TOK_MIN, 
                                                                                                                                        TOK_MAX, 
                                                                                                                                        DOC_MIN, 
                                                                                                                                        PREPROCESS_DEACCENT)
    path_lda = os.path.join(working_dir, lda_space_name)
    gensim.corpora.MmCorpus.serialize(path_lda, lda_model[corpus_bow_tlg])
    
    # save model
    lda_model_name = 'gensim_lda_model_1kgrk_numtopics{}_numpasses{}_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.model'.format(num_topics, 
                                                                                                                                           PASSES, 
                                                                                                                                           no_below, 
                                                                                                                                           no_above, 
                                                                                                                                           TOK_MIN, 
                                                                                                                                           TOK_MAX, 
                                                                                                                                           DOC_MIN, 
                                                                                                                                           PREPROCESS_DEACCENT)
    # path_lda = os.path.join(working_dir, lda_model_name)
    path_lda = os.path.join('.', lda_model_name)
    lda_model.save(path_lda)
    print('Time to train LDA model space:', time.time() - t0)


INFO : using symmetric alpha at 0.5
INFO : using symmetric eta at 4.3821209465381244e-05
INFO : using serial LDA version on this node
Beginning training ...
... 2 topics and 1 passes ...
INFO : running online LDA training, 2 topics, 1 passes over the supplied corpus of 25 documents, updating every 2000 documents, evaluating every ~25 documents, iterating 50x with a convergence threshold of 0.001000
WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy
INFO : training LDA model using 1 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #25/25, outstanding queue size 1
INFO : topic #0 (0.500): 0.004*"πεπερασμένον" + 0.003*"ἀπείρου" + 0.002*"μεταβάλλον" + 0.002*"προσεχῶς" + 0.002*"εὔδημος" + 0.002*"κινουμένῳ" + 0.001*"αὐτοκίνητον" + 0.001*"νόησις" + 0.001*"ἠρεμεῖ" + 0.001*"ἠρεμία"
INFO : topic #1 (0.500): 0.002*"ἀπείρου" + 0.002*"πεπερασμένον" + 0.002*"μεταβάλλον" + 0.002*"κινουμένῳ" + 0.002*"εὔδημος" + 0.002*"ἀπείρῳ" + 0.002*"αὐτοκίνητον" + 0.002*"στερήσεως" + 0.001*"προσεχῶς" + 0.001*"ἠρεμεῖ"
INFO : topic diff=1.113706, rho=1.000000
INFO : -9.604 per-word bound, 778.4 perplexity estimate based on a held-out corpus of 25 documents with 69697 words
INFO : storing corpus in Matrix Market format to /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_space_1kgrk_numtopics2_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm
INFO : saving sparse matrix to /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_space_1kgrk_numtopics2_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm
INFO : PROGRESS: saving document #0
INFO : saved 25x2 matrix, density=78.000% (39/50)
INFO : saving MmCorpus index to /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_space_1kgrk_numtopics2_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm.index
INFO : saving LdaState object under ./gensim_lda_model_1kgrk_numtopics2_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state, separately None
INFO : saved ./gensim_lda_model_1kgrk_numtopics2_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state
INFO : saving LdaMulticore object under ./gensim_lda_model_1kgrk_numtopics2_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model, separately ['expElogbeta', 'sstats']
INFO : storing np array 'expElogbeta' to ./gensim_lda_model_1kgrk_numtopics2_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy
INFO : not storing attribute dispatcher
INFO : not storing attribute id2word
INFO : not storing attribute state
INFO : saved ./gensim_lda_model_1kgrk_numtopics2_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : using symmetric alpha at 0.3333333333333333
INFO : using symmetric eta at 4.3821209465381244e-05
INFO : using serial LDA version on this node
Time to train LDA model space: 9.396849632263184
Beginning training ...
... 3 topics and 1 passes ...
INFO : running online LDA training, 3 topics, 1 passes over the supplied corpus of 25 documents, updating every 2000 documents, evaluating every ~25 documents, iterating 50x with a convergence threshold of 0.001000
WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy
INFO : training LDA model using 1 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #25/25, outstanding queue size 1
INFO : topic #0 (0.333): 0.002*"πεπερασμένον" + 0.002*"εὔδημος" + 0.002*"ἀπείρου" + 0.002*"κινουμένῳ" + 0.002*"μεταβάλλον" + 0.002*"προσεχῶς" + 0.001*"διαιρετὸν" + 0.001*"νόησις" + 0.001*"ἠρεμία" + 0.001*"ἀλλʼ"
INFO : topic #1 (0.333): 0.003*"πεπερασμένον" + 0.003*"ἀπείρου" + 0.002*"μεταβάλλον" + 0.002*"αὐτοκίνητον" + 0.002*"κινουμένῳ" + 0.002*"προσεχῶς" + 0.002*"εὔδημος" + 0.001*"διαιρετόν" + 0.001*"στερήσεως" + 0.001*"νόησις"
INFO : topic #2 (0.333): 0.004*"πεπερασμένον" + 0.002*"ἀπείρου" + 0.002*"μεταβάλλον" + 0.002*"εὔδημος" + 0.002*"προσεχῶς" + 0.002*"κινουμένῳ" + 0.002*"ἠρεμεῖ" + 0.002*"ἀπείρῳ" + 0.002*"αὐτοκίνητον" + 0.002*"ἠρεμία"
INFO : topic diff=1.423865, rho=1.000000
INFO : -9.732 per-word bound, 850.2 perplexity estimate based on a held-out corpus of 25 documents with 69697 words
INFO : storing corpus in Matrix Market format to /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_space_1kgrk_numtopics3_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm
INFO : saving sparse matrix to /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_space_1kgrk_numtopics3_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm
INFO : PROGRESS: saving document #0
INFO : saved 25x3 matrix, density=54.667% (41/75)
INFO : saving MmCorpus index to /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_space_1kgrk_numtopics3_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm.index
INFO : saving LdaState object under ./gensim_lda_model_1kgrk_numtopics3_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state, separately None
INFO : saved ./gensim_lda_model_1kgrk_numtopics3_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state
INFO : saving LdaMulticore object under ./gensim_lda_model_1kgrk_numtopics3_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model, separately ['expElogbeta', 'sstats']
INFO : storing np array 'expElogbeta' to ./gensim_lda_model_1kgrk_numtopics3_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy
INFO : not storing attribute dispatcher
INFO : not storing attribute id2word
INFO : not storing attribute state
INFO : saved ./gensim_lda_model_1kgrk_numtopics3_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : using symmetric alpha at 0.2
INFO : using symmetric eta at 4.3821209465381244e-05
INFO : using serial LDA version on this node
Time to train LDA model space: 9.716853857040405
Beginning training ...
... 5 topics and 1 passes ...
INFO : running online LDA training, 5 topics, 1 passes over the supplied corpus of 25 documents, updating every 2000 documents, evaluating every ~25 documents, iterating 50x with a convergence threshold of 0.001000
WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy
INFO : training LDA model using 1 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #25/25, outstanding queue size 1
INFO : topic #0 (0.200): 0.003*"πεπερασμένον" + 0.003*"μεταβάλλον" + 0.002*"προσεχῶς" + 0.002*"κινουμένῳ" + 0.002*"ἀπείρου" + 0.002*"ἀπείρῳ" + 0.002*"αὐτοκίνητον" + 0.001*"στερήσεως" + 0.001*"ἠρεμία" + 0.001*"εὔδημος"
INFO : topic #1 (0.200): 0.004*"πεπερασμένον" + 0.003*"ἀπείρου" + 0.002*"μεταβάλλον" + 0.002*"εὔδημος" + 0.002*"προσεχῶς" + 0.002*"κινουμένῳ" + 0.002*"αὐτοκίνητον" + 0.002*"στερήσεως" + 0.002*"νόησις" + 0.002*"διαιρετὸν"
INFO : topic #2 (0.200): 0.003*"πεπερασμένον" + 0.003*"ἀπείρου" + 0.002*"κινουμένῳ" + 0.002*"μεταβάλλον" + 0.001*"προσεχῶς" + 0.001*"νόησις" + 0.001*"ἠρεμεῖ" + 0.001*"εὔδημος" + 0.001*"κατηγορίαις" + 0.001*"αὐτοκίνητον"
INFO : topic #3 (0.200): 0.003*"πεπερασμένον" + 0.002*"ἀπείρου" + 0.002*"εὔδημος" + 0.002*"μεταβάλλον" + 0.001*"ἠρεμεῖ" + 0.001*"κινουμένῳ" + 0.001*"νόησις" + 0.001*"αὐτοκίνητον" + 0.001*"ἀδιαίρετον" + 0.001*"προσεχῶς"
INFO : topic #4 (0.200): 0.003*"πεπερασμένον" + 0.003*"ἀπείρου" + 0.002*"εὔδημος" + 0.002*"μεταβάλλον" + 0.002*"αὐτοκίνητον" + 0.001*"προσεχῶς" + 0.001*"ἠρεμεῖ" + 0.001*"κινουμένῳ" + 0.001*"νόησις" + 0.001*"στερήσεως"
INFO : topic diff=2.138563, rho=1.000000
INFO : -10.137 per-word bound, 1125.7 perplexity estimate based on a held-out corpus of 25 documents with 69697 words
INFO : storing corpus in Matrix Market format to /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_space_1kgrk_numtopics5_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm
INFO : saving sparse matrix to /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_space_1kgrk_numtopics5_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm
INFO : PROGRESS: saving document #0
INFO : saved 25x5 matrix, density=36.000% (45/125)
INFO : saving MmCorpus index to /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_space_1kgrk_numtopics5_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm.index
INFO : saving LdaState object under ./gensim_lda_model_1kgrk_numtopics5_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state, separately None
INFO : saved ./gensim_lda_model_1kgrk_numtopics5_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state
INFO : saving LdaMulticore object under ./gensim_lda_model_1kgrk_numtopics5_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model, separately ['expElogbeta', 'sstats']
INFO : storing np array 'expElogbeta' to ./gensim_lda_model_1kgrk_numtopics5_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy
INFO : not storing attribute dispatcher
INFO : not storing attribute id2word
INFO : not storing attribute state
INFO : saved ./gensim_lda_model_1kgrk_numtopics5_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : using symmetric alpha at 0.1
INFO : using symmetric eta at 4.3821209465381244e-05
INFO : using serial LDA version on this node
Time to train LDA model space: 13.362380266189575
Beginning training ...
... 10 topics and 1 passes ...
INFO : running online LDA training, 10 topics, 1 passes over the supplied corpus of 25 documents, updating every 2000 documents, evaluating every ~25 documents, iterating 50x with a convergence threshold of 0.001000
WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy
INFO : training LDA model using 1 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #25/25, outstanding queue size 1
INFO : topic #3 (0.100): 0.002*"ἀπείρου" + 0.002*"πεπερασμένον" + 0.002*"εὔδημος" + 0.002*"μεταβάλλον" + 0.001*"προσεχῶς" + 0.001*"αὐτοκίνητον" + 0.001*"στερήσεως" + 0.001*"φλεβοτομίας" + 0.001*"κινουμένῳ" + 0.001*"διαιρετὸν"
INFO : topic #5 (0.100): 0.002*"ἀπείρου" + 0.002*"εὔδημος" + 0.002*"πεπερασμένον" + 0.001*"μεταβάλλον" + 0.001*"κινουμένῳ" + 0.001*"προσεχῶς" + 0.001*"στερήσεως" + 0.001*"αὐτοκίνητον" + 0.001*"ἀπείρῳ" + 0.001*"νόησις"
INFO : topic #6 (0.100): 0.004*"πεπερασμένον" + 0.004*"ἀπείρου" + 0.002*"μεταβάλλον" + 0.002*"κινουμένῳ" + 0.002*"προσεχῶς" + 0.002*"εὔδημος" + 0.002*"ἠρεμία" + 0.002*"αὐτοκίνητον" + 0.001*"ἠρεμεῖ" + 0.001*"ἀπείρῳ"
INFO : topic #0 (0.100): 0.004*"πεπερασμένον" + 0.004*"ἀπείρου" + 0.003*"εὔδημος" + 0.003*"μεταβάλλον" + 0.002*"κινουμένῳ" + 0.002*"αὐτοκίνητον" + 0.002*"προσεχῶς" + 0.002*"ἀδιαίρετον" + 0.002*"νόησις" + 0.002*"διαιρετὸν"
INFO : topic #4 (0.100): 0.003*"πεπερασμένον" + 0.002*"ἀπείρου" + 0.002*"κινουμένῳ" + 0.002*"προσεχῶς" + 0.002*"εὔδημος" + 0.002*"στερήσεως" + 0.002*"ἠρεμία" + 0.002*"μεταβάλλον" + 0.001*"νόησις" + 0.001*"ταὐτομάτου"
INFO : topic diff=4.250175, rho=1.000000
INFO : -11.038 per-word bound, 2102.0 perplexity estimate based on a held-out corpus of 25 documents with 69697 words
INFO : storing corpus in Matrix Market format to /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_space_1kgrk_numtopics10_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm
INFO : saving sparse matrix to /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_space_1kgrk_numtopics10_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm
INFO : PROGRESS: saving document #0
INFO : saved 25x10 matrix, density=18.000% (45/250)
INFO : saving MmCorpus index to /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_space_1kgrk_numtopics10_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm.index
INFO : saving LdaState object under ./gensim_lda_model_1kgrk_numtopics10_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state, separately None
INFO : saved ./gensim_lda_model_1kgrk_numtopics10_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state
INFO : saving LdaMulticore object under ./gensim_lda_model_1kgrk_numtopics10_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model, separately ['expElogbeta', 'sstats']
INFO : storing np array 'expElogbeta' to ./gensim_lda_model_1kgrk_numtopics10_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy
INFO : not storing attribute dispatcher
INFO : not storing attribute id2word
INFO : not storing attribute state
INFO : saved ./gensim_lda_model_1kgrk_numtopics10_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : using symmetric alpha at 0.04
INFO : using symmetric eta at 4.3821209465381244e-05
INFO : using serial LDA version on this node
Time to train LDA model space: 14.582515954971313
Beginning training ...
... 25 topics and 1 passes ...
INFO : running online LDA training, 25 topics, 1 passes over the supplied corpus of 25 documents, updating every 2000 documents, evaluating every ~25 documents, iterating 50x with a convergence threshold of 0.001000
WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy
INFO : training LDA model using 1 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #25/25, outstanding queue size 1
INFO : topic #21 (0.040): 0.004*"ἀπείρου" + 0.003*"πεπερασμένον" + 0.002*"φλεβοτομίας" + 0.002*"εὔδημος" + 0.002*"αὐτοκίνητον" + 0.002*"κινουμένῳ" + 0.002*"μεταβάλλον" + 0.002*"προσεχῶς" + 0.002*"στερήσεως" + 0.002*"ἠρεμία"
INFO : topic #18 (0.040): 0.003*"πεπερασμένον" + 0.002*"μεταβάλλον" + 0.002*"κινουμένῳ" + 0.002*"εὔδημος" + 0.002*"ἀπείρου" + 0.002*"προσεχῶς" + 0.002*"διαιρετόν" + 0.001*"νόησις" + 0.001*"ἠρεμία" + 0.001*"αὐτοκίνητον"
INFO : topic #8 (0.040): 0.003*"ἀπείρου" + 0.003*"πεπερασμένον" + 0.002*"νόησις" + 0.002*"κινουμένῳ" + 0.002*"εὔδημος" + 0.002*"μεταβάλλον" + 0.002*"αὐτοκίνητον" + 0.001*"ἀπείρῳ" + 0.001*"προσεχῶς" + 0.001*"ἀλλοιώσεως"
INFO : topic #19 (0.040): 0.002*"πεπερασμένον" + 0.002*"εὔδημος" + 0.002*"ἀπείρου" + 0.002*"συζυγία" + 0.001*"νόησις" + 0.001*"μεταβάλλον" + 0.001*"ἠρεμία" + 0.001*"προσεχῶς" + 0.001*"ἀναξαγόρας" + 0.001*"αὐτοκίνητον"
INFO : topic #11 (0.040): 0.003*"ἀπείρου" + 0.003*"πεπερασμένον" + 0.002*"μεταβάλλον" + 0.002*"αὐτοκίνητον" + 0.002*"ἠρεμεῖ" + 0.002*"ἠρεμία" + 0.002*"κινουμένῳ" + 0.002*"στερήσεως" + 0.002*"εὔδημος" + 0.002*"προσεχῶς"
INFO : topic diff=11.575606, rho=1.000000
INFO : -14.259 per-word bound, 19601.6 perplexity estimate based on a held-out corpus of 25 documents with 69697 words
INFO : storing corpus in Matrix Market format to /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_space_1kgrk_numtopics25_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm
INFO : saving sparse matrix to /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_space_1kgrk_numtopics25_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm
INFO : PROGRESS: saving document #0
INFO : saved 25x25 matrix, density=10.400% (65/625)
INFO : saving MmCorpus index to /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_space_1kgrk_numtopics25_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm.index
INFO : saving LdaState object under ./gensim_lda_model_1kgrk_numtopics25_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state, separately None
INFO : saved ./gensim_lda_model_1kgrk_numtopics25_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state
INFO : saving LdaMulticore object under ./gensim_lda_model_1kgrk_numtopics25_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model, separately ['expElogbeta', 'sstats']
INFO : storing np array 'expElogbeta' to ./gensim_lda_model_1kgrk_numtopics25_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy
INFO : not storing attribute dispatcher
INFO : not storing attribute id2word
INFO : not storing attribute state
INFO : saved ./gensim_lda_model_1kgrk_numtopics25_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : using symmetric alpha at 0.02
INFO : using symmetric eta at 4.3821209465381244e-05
INFO : using serial LDA version on this node
Time to train LDA model space: 15.639700651168823
Beginning training ...
... 50 topics and 1 passes ...
INFO : running online LDA training, 50 topics, 1 passes over the supplied corpus of 25 documents, updating every 2000 documents, evaluating every ~25 documents, iterating 50x with a convergence threshold of 0.001000
WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy
INFO : training LDA model using 1 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #25/25, outstanding queue size 1
INFO : topic #7 (0.020): 0.004*"πουλὺ" + 0.002*"τῇσι" + 0.002*"ὁκόσον" + 0.002*"πεπερασμένον" + 0.002*"ὅκου" + 0.002*"ὀδόντες" + 0.002*"λιπαρὸν" + 0.002*"ὁκόταν" + 0.002*"θερμαινόμενον" + 0.002*"ἑβδομάδων"
INFO : topic #30 (0.020): 0.002*"πεπερασμένον" + 0.002*"μεταβάλλον" + 0.002*"ἀπείρου" + 0.001*"αὐτοκίνητον" + 0.001*"προσεχῶς" + 0.001*"ἀδιαίρετον" + 0.001*"φθορά" + 0.001*"εὔδημος" + 0.001*"στερήσεως" + 0.001*"ἠρεμία"
INFO : topic #17 (0.020): 0.003*"πεπερασμένον" + 0.002*"κινουμένῳ" + 0.002*"ἀπείρου" + 0.002*"εὔδημος" + 0.002*"μεταβάλλον" + 0.002*"νόησις" + 0.002*"ἀπείρῳ" + 0.002*"ἠρεμεῖ" + 0.001*"αὐτοκίνητον" + 0.001*"προσεχῶς"
INFO : topic #33 (0.020): 0.004*"ἀλλʼ" + 0.003*"ἐπʼ" + 0.002*"λίμνην" + 0.002*"οὐδʼ" + 0.002*"ἀπείρου" + 0.002*"μεταβάλλον" + 0.002*"πεπερασμένον" + 0.002*"εὔδημος" + 0.002*"νόησις" + 0.002*"κινουμένῳ"
INFO : topic #32 (0.020): 0.003*"ἀπείρου" + 0.002*"πεπερασμένον" + 0.002*"εὔδημος" + 0.002*"μεταβάλλον" + 0.002*"αὐτοκίνητον" + 0.001*"προσεχῶς" + 0.001*"ἀπείρῳ" + 0.001*"κινουμένῳ" + 0.001*"ἀλλοιώσεως" + 0.001*"νόησις"
INFO : topic diff=24.843323, rho=1.000000
INFO : -19.496 per-word bound, 739596.5 perplexity estimate based on a held-out corpus of 25 documents with 69697 words
INFO : storing corpus in Matrix Market format to /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_space_1kgrk_numtopics50_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm
INFO : saving sparse matrix to /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_space_1kgrk_numtopics50_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm
INFO : PROGRESS: saving document #0
INFO : saved 25x50 matrix, density=4.400% (55/1250)
INFO : saving MmCorpus index to /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_space_1kgrk_numtopics50_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm.index
INFO : saving LdaState object under ./gensim_lda_model_1kgrk_numtopics50_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state, separately None
INFO : saved ./gensim_lda_model_1kgrk_numtopics50_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state
INFO : saving LdaMulticore object under ./gensim_lda_model_1kgrk_numtopics50_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model, separately ['expElogbeta', 'sstats']
INFO : storing np array 'expElogbeta' to ./gensim_lda_model_1kgrk_numtopics50_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy
INFO : not storing attribute dispatcher
INFO : not storing attribute id2word
INFO : not storing attribute state
INFO : saved ./gensim_lda_model_1kgrk_numtopics50_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : using symmetric alpha at 0.01818181818181818
INFO : using symmetric eta at 4.3821209465381244e-05
INFO : using serial LDA version on this node
Time to train LDA model space: 25.987497091293335
Beginning training ...
... 55 topics and 1 passes ...
INFO : running online LDA training, 55 topics, 1 passes over the supplied corpus of 25 documents, updating every 2000 documents, evaluating every ~25 documents, iterating 50x with a convergence threshold of 0.001000
WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy
INFO : training LDA model using 1 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #25/25, outstanding queue size 1
INFO : topic #6 (0.018): 0.003*"πεπερασμένον" + 0.003*"ἀπείρου" + 0.002*"προσεχῶς" + 0.002*"αὐτοκίνητον" + 0.002*"εὔδημος" + 0.002*"κινουμένῳ" + 0.002*"ἀπείρῳ" + 0.002*"μεταβάλλον" + 0.002*"διαιρετὸν" + 0.002*"κατηγορίαις"
INFO : topic #29 (0.018): 0.005*"ἀλλʼ" + 0.004*"πεπερασμένον" + 0.003*"ἀπείρου" + 0.002*"μεταβάλλον" + 0.002*"εὔδημος" + 0.002*"προσεχῶς" + 0.002*"κινουμένῳ" + 0.002*"στερήσεως" + 0.002*"ἠρεμεῖ" + 0.002*"ἀπείρῳ"
INFO : topic #31 (0.018): 0.006*"πεπερασμένον" + 0.004*"ἀπείρου" + 0.003*"εὔδημος" + 0.003*"μεταβάλλον" + 0.003*"αὐτοκίνητον" + 0.002*"προσεχῶς" + 0.002*"κινουμένῳ" + 0.002*"ἀπείρῳ" + 0.002*"ἠρεμεῖ" + 0.002*"διαιρετόν"
INFO : topic #47 (0.018): 0.003*"πεπερασμένον" + 0.003*"ἀλλʼ" + 0.003*"ἀπείρου" + 0.002*"μεταβάλλον" + 0.002*"εὔδημος" + 0.002*"προσεχῶς" + 0.002*"νόησις" + 0.001*"στερήσεως" + 0.001*"ἠρεμία" + 0.001*"ἀπείρῳ"
INFO : topic #42 (0.018): 0.003*"νόησις" + 0.002*"πεπερασμένον" + 0.002*"νόησιν" + 0.001*"ἀπείρου" + 0.001*"μεταβάλλον" + 0.001*"νοητῷ" + 0.001*"εὐδαιμονεῖν" + 0.001*"εὔδημος" + 0.001*"στερήσεως" + 0.001*"δικασταί"
INFO : topic diff=27.593496, rho=1.000000
INFO : -20.378 per-word bound, 1362753.8 perplexity estimate based on a held-out corpus of 25 documents with 69697 words
INFO : storing corpus in Matrix Market format to /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_space_1kgrk_numtopics55_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm
INFO : saving sparse matrix to /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_space_1kgrk_numtopics55_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm
INFO : PROGRESS: saving document #0
INFO : saved 25x54 matrix, density=4.074% (55/1350)
INFO : saving MmCorpus index to /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_space_1kgrk_numtopics55_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm.index
INFO : saving LdaState object under ./gensim_lda_model_1kgrk_numtopics55_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state, separately None
INFO : saved ./gensim_lda_model_1kgrk_numtopics55_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state
INFO : saving LdaMulticore object under ./gensim_lda_model_1kgrk_numtopics55_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model, separately ['expElogbeta', 'sstats']
INFO : storing np array 'expElogbeta' to ./gensim_lda_model_1kgrk_numtopics55_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy
INFO : not storing attribute dispatcher
INFO : not storing attribute id2word
INFO : not storing attribute state
INFO : saved ./gensim_lda_model_1kgrk_numtopics55_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : using symmetric alpha at 0.01
INFO : using symmetric eta at 4.3821209465381244e-05
INFO : using serial LDA version on this node
Time to train LDA model space: 24.59763216972351
Beginning training ...
... 100 topics and 1 passes ...
INFO : running online LDA training, 100 topics, 1 passes over the supplied corpus of 25 documents, updating every 2000 documents, evaluating every ~25 documents, iterating 50x with a convergence threshold of 0.001000
WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy
INFO : training LDA model using 1 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #25/25, outstanding queue size 1
INFO : topic #81 (0.010): 0.004*"πεπερασμένον" + 0.003*"ἀπείρου" + 0.003*"ἠρεμεῖ" + 0.003*"κινουμένῳ" + 0.002*"φλεβοτομίας" + 0.002*"εὔδημος" + 0.002*"μεταβάλλον" + 0.002*"ἠρεμία" + 0.002*"αὐτοκίνητον" + 0.002*"προσεχῶς"
INFO : topic #76 (0.010): 0.003*"πεπερασμένον" + 0.002*"ἀπείρου" + 0.002*"μεταβάλλον" + 0.001*"κινουμένῳ" + 0.001*"ἀναθυμιάσεως" + 0.001*"προσεχῶς" + 0.001*"εὔδημος" + 0.001*"αὐτοκίνητον" + 0.001*"ταὐτομάτου" + 0.001*"ἀλλʼ"
INFO : topic #24 (0.010): 0.002*"ἀπείρου" + 0.002*"πεπερασμένον" + 0.002*"νόησις" + 0.002*"εὔδημος" + 0.002*"μεταβάλλον" + 0.002*"αὐτοκίνητον" + 0.001*"προσεχῶς" + 0.001*"ἀπείρῳ" + 0.001*"κινουμένῳ" + 0.001*"στερήσεως"
INFO : topic #95 (0.010): 0.003*"πεπερασμένον" + 0.002*"ἀπείρου" + 0.002*"κινουμένῳ" + 0.002*"προσεχῶς" + 0.002*"εὔδημος" + 0.002*"αὐτοκίνητον" + 0.001*"μεταβάλλον" + 0.001*"κατηγορίαις" + 0.001*"ἠρεμία" + 0.001*"ἠρεμεῖ"
INFO : topic #84 (0.010): 0.003*"πεπερασμένον" + 0.003*"ἀπείρου" + 0.003*"αὐτοκίνητον" + 0.002*"φύσιος" + 0.002*"τῇσι" + 0.002*"κινουμένῳ" + 0.002*"εὔδημος" + 0.002*"μεταβάλλον" + 0.002*"μάθησις" + 0.002*"ὁκοῖον"
INFO : topic diff=52.896848, rho=1.000000
INFO : -29.331 per-word bound, 675427778.2 perplexity estimate based on a held-out corpus of 25 documents with 69697 words
INFO : storing corpus in Matrix Market format to /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_space_1kgrk_numtopics100_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm
INFO : saving sparse matrix to /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_space_1kgrk_numtopics100_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm
INFO : PROGRESS: saving document #0
INFO : saved 25x100 matrix, density=3.080% (77/2500)
INFO : saving MmCorpus index to /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_space_1kgrk_numtopics100_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm.index
INFO : saving LdaState object under ./gensim_lda_model_1kgrk_numtopics100_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state, separately None
INFO : saved ./gensim_lda_model_1kgrk_numtopics100_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state
INFO : saving LdaMulticore object under ./gensim_lda_model_1kgrk_numtopics100_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model, separately ['expElogbeta', 'sstats']
INFO : storing np array 'expElogbeta' to ./gensim_lda_model_1kgrk_numtopics100_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy
INFO : not storing attribute dispatcher
INFO : not storing attribute id2word
INFO : not storing attribute state
INFO : saved ./gensim_lda_model_1kgrk_numtopics100_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
Time to train LDA model space: 41.29778981208801

In [16]:
# # Examples of how to use the model
# lda_model.print_topics(-1)  # print a few most important words for each LDA topic
# # transform text into the bag-of-words space
# bow_vector = id2word_map.doc2bow(tokenize(doc, rm_ascii=remove_non_ascii))
# print([(id2word_map[id], count) for id, count in bow_vector])

# # transform into LDA space
# lda_vector = lda_model[bow_vector]
# print(lda_vector)

# # print the document's single most prominent LDA topic
# print(lda_model.print_topic(max(lda_vector, key=lambda item: item[1])[0]))

Evaluation

Word intrusion

For each trained topic, they take its first ten words, then substitute one of them with another, randomly chosen word (intruder!) and see whether a human can reliably tell which one it was. If so, the trained topic is topically coherent (good); if not, the topic has no discernible theme (bad)


In [17]:
for num_topics in NUM_TOPICS_LIST:
    # load model
    lda_model_name = 'gensim_lda_model_1kgrk_numtopics{}_numpasses{}_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.model'.format(num_topics, 
                                                                                                                                           PASSES, 
                                                                                                                                           no_below, 
                                                                                                                                           no_above, 
                                                                                                                                           TOK_MIN, 
                                                                                                                                           TOK_MAX, 
                                                                                                                                           DOC_MIN, 
                                                                                                                                           PREPROCESS_DEACCENT)
    print('Loading model: {} ...'.format(lda_model_name))
    print('... for word intrusion testing ...')
    path_lda = os.path.join(working_dir, lda_model_name)
    lda_model = gensim.models.LdaMulticore.load(path_lda)
    
    # select top 50 words for each of the LDA topics
    print('Top 50 words of each LDA model:')
    top_words = [[word for word, _ in lda_model.show_topic(topicno, topn=50)] for topicno in range(lda_model.num_topics)]
    print(top_words)
    print('')

    # get all top 50 words in all 20 topics, as one large set
    all_words = set(itertools.chain.from_iterable(top_words))
    print("Can you spot the misplaced word in each topic?")

    # for each topic, replace a word at a different index, to make it more interesting
    replace_index = np.random.randint(0, 10, lda_model.num_topics)

    replacements = []
    for topicno, words in enumerate(top_words):
        other_words = all_words.difference(words)
        replacement = np.random.choice(list(other_words))
        replacements.append((words[replace_index[topicno]], replacement))
        words[replace_index[topicno]] = replacement
        print("%i: %s" % (topicno, ' '.join(words[:10])))
    
    print("Actual replacements were:")
    print(list(enumerate(replacements)))
    print('')


INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics2_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
Loading model: gensim_lda_model_1kgrk_numtopics2_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model ...
... for word intrusion testing ...
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-17-54081a63f3ac> in <module>()
     12     print('... for word intrusion testing ...')
     13     path_lda = os.path.join(working_dir, lda_model_name)
---> 14     lda_model = gensim.models.LdaMulticore.load(path_lda)
     15 
     16     # select top 50 words for each of the LDA topics

~/venv3/lib/python3.6/site-packages/gensim/models/ldamodel.py in load(cls, fname, *args, **kwargs)
   1123         """
   1124         kwargs['mmap'] = kwargs.get('mmap', None)
-> 1125         result = super(LdaModel, cls).load(fname, *args, **kwargs)
   1126 
   1127         # check if `random_state` attribute has been set after main pickle load

~/venv3/lib/python3.6/site-packages/gensim/utils.py in load(cls, fname, mmap)
    269         compress, subname = SaveLoad._adapt_by_suffix(fname)
    270 
--> 271         obj = unpickle(fname)
    272         obj._load_specials(fname, mmap, compress, subname)
    273         logger.info("loaded %s", fname)

~/venv3/lib/python3.6/site-packages/gensim/utils.py in unpickle(fname)
    928 def unpickle(fname):
    929     """Load pickled object from `fname`"""
--> 930     with smart_open(fname, 'rb') as f:
    931         # Because of loading from S3 load can't be used (missing readline in smart_open)
    932         if sys.version_info > (3, 0):

~/venv3/lib/python3.6/site-packages/smart_open/smart_open_lib.py in smart_open(uri, mode, **kw)
    138             # local files -- both read & write supported
    139             # compression, if any, is determined by the filename extension (.gz, .bz2)
--> 140             return file_smart_open(parsed_uri.uri_path, mode)
    141         elif parsed_uri.scheme in ("s3", "s3n", "s3u"):
    142             kwargs = {}

~/venv3/lib/python3.6/site-packages/smart_open/smart_open_lib.py in file_smart_open(fname, mode)
    642 
    643     """
--> 644     return compression_wrapper(open(fname, mode), fname, mode)
    645 
    646 

FileNotFoundError: [Errno 2] No such file or directory: '/home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics2_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model'

Split doc

We'll split each document into two parts, and check that 1) topics of the first half are similar to topics of the second 2) halves of different documents are mostly dissimilar


In [ ]:
# evaluate on 1k documents **not** used in LDA training
docs_preprocessed = os.path.expanduser('~/cltk_data/greek/text/tlg/plaintext/')
doc_stream = (tokens for _, tokens in iter_docs(docs_preprocessed))  # generator
test_docs = list(itertools.islice(doc_stream, 100, 200))  # ['πανυ', 'καλως', ...], [...], ...]

In [ ]:
def intra_inter(model, test_docs, num_pairs=10000):
    # split each test document into two halves and compute topics for each half
    part1 = [model[id2word_map.doc2bow(tokens[: len(tokens) // 2])] for tokens in test_docs]
    part2 = [model[id2word_map.doc2bow(tokens[len(tokens) // 2 :])] for tokens in test_docs]
    
    # print computed similarities (uses cossim)
    print("average cosine similarity between corresponding parts (higher is better):")
    print(np.mean([gensim.matutils.cossim(p1, p2) for p1, p2 in zip(part1, part2)]))

    random_pairs = np.random.randint(0, len(test_docs), size=(num_pairs, 2))
    print("average cosine similarity between {} random parts (lower is better):".format(num_pairs))    
    print(np.mean([gensim.matutils.cossim(part1[i[0]], part2[i[1]]) for i in random_pairs]))

In [ ]:
for num_topics in NUM_TOPICS_LIST:
    # load model
    lda_model_name = 'gensim_lda_model_1kgrk_numtopics{}_numpasses{}_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.model'.format(num_topics, 
                                                                                                                                           PASSES, 
                                                                                                                                           no_below, 
                                                                                                                                           no_above, 
                                                                                                                                           TOK_MIN, 
                                                                                                                                           TOK_MAX, 
                                                                                                                                           DOC_MIN, 
                                                                                                                                           PREPROCESS_DEACCENT)
    print('Loading model: {} ...'.format(lda_model_name))
    print('... for testing split document topic matching ...')
    path_lda = os.path.join(working_dir, lda_model_name)
    lda_model = gensim.models.LdaMulticore.load(path_lda)

    print("LDA results:")
    # what should num_pairs be?
    intra_inter(lda_model, test_docs, num_pairs=total_included_docs)
    print('')

Score all docs


In [ ]:
id_auth_map = get_id_author()

In [ ]:
# write to file topics for each doc
for num_topics in NUM_TOPICS_LIST:
    print('num topics', num_topics)
    # load model
    lda_model_name = 'gensim_lda_model_1kgrk_numtopics{}_numpasses{}_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.model'.format(num_topics, 
                                                                                                                                           PASSES, 
                                                                                                                                           no_below, 
                                                                                                                                           no_above, 
                                                                                                                                           TOK_MIN, 
                                                                                                                                           TOK_MAX, 
                                                                                                                                           DOC_MIN, 
                                                                                                                                           PREPROCESS_DEACCENT)
    print('Loading model: {} ...'.format(lda_model_name))
    print('... scoring topics of all documents ...')
    path_lda = os.path.join(working_dir, lda_model_name)
    # https://radimrehurek.com/gensim/models/ldamodel.html#gensim.models.ldamodel.LdaModel.get_document_topics
    lda_model = gensim.models.LdaMulticore.load(path_lda)

    # mk save path name
    scores_name = lda_model_name.rstrip('.model') + '.scores'
    scores_path = os.path.join(working_dir, scores_name)
    doc_topics = ''
    print('Going to write LDA scores for each file at: "{}"'.format(scores_path))
    for file_name, tokens in iter_docs(docs_preprocessed):
        # print(file_name, tokens[:10])  # print the article title and its first ten tokens
        # print(file_name)
        topic_distribution = str(lda_model[id2word_map.doc2bow(tokens)])
        # print(topic_distribution)
        
        # convert file name to author name, and get epithet
        # auth_id = file_name.lstrip('TLG').rstrip('.TXT')  # for TLG
        auth_id = file_name.rstrip('.txt')  # for 1K Greek
        auth_name = None
        auth_epithet = None
        # auth_name = id_auth_map[auth_id]  # for TLG
        # auth_epithet = str(get_epithet_of_author(auth_id))  # for TLG
        
        doc_topics += 'file: ' + file_name + '\n'
        doc_topics += 'author: ' + auth_name + '\n'
        doc_topics += 'epithet: ' + auth_epithet + '\n'
        doc_topics += topic_distribution + '\n\n'
    print('Wrote file to: "{}"'.format(scores_path))
    with open(scores_path, 'w') as file_open:
        file_open.write(doc_topics)
    print('')

In [ ]: