Following the tutorial "Topic Modeling for Fun and Profit"
In [1]:
import itertools
import logging
import os
import pickle
import sys
import time
from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithets
from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithet_of_author
from cltk.corpus.greek.tlg.parse_tlg_indices import get_id_author
import gensim
from gensim.corpora.mmcorpus import MmCorpus
from gensim.utils import simple_preprocess
import numpy as np
In [2]:
# put the parent dir on the path, for importing the local helper module
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
In [3]:
# import local module
from lda_helpers import mk_working_dir
from lda_helpers import working_dir
from lda_helpers import tokenize
from lda_helpers import iter_docs
from lda_helpers import PREPROCESS_DEACCENT
from lda_helpers import TOK_MIN
from lda_helpers import TOK_MAX
from lda_helpers import DOC_MIN
from lda_helpers import remove_ascii
from lda_helpers import STOPS_LIST
from lda_helpers import no_below
from lda_helpers import no_above
from lda_helpers import GenerateCorpus
In [4]:
# enable verbose print-to-screen logging for Gensim
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO # ipython sometimes messes up the logging setup; restore
In [5]:
# where our results will go in ~/cltk_data/user_data
mk_working_dir(working_dir)
In [6]:
# Take a look at the docs after preprocessing
# Open corpus iterator
# docs_path_rel = '~/cltk_data/greek/text/tlg/plaintext/'  # for TLG
docs_path_rel = '~/cltk_data/greek/text/greek_text_first1kgreek_plaintext/'
docs_preprocessed = os.path.expanduser(docs_path_rel)
stream = iter_docs(docs_preprocessed, rm_ascii=remove_ascii)
for title, tokens in itertools.islice(stream, 8):
    print(title, tokens[:10])  # print the document title and its first ten tokens
In [7]:
# Open corpus iterator
doc_stream = (tokens for _, tokens in iter_docs(docs_preprocessed, rm_ascii=remove_ascii))
In [8]:
# store the dictionary, for future reference
dict_name = 'gensim_dict_id2word_1kgrk_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.dict'.format(no_below,
                                                                                                               no_above,
                                                                                                               TOK_MIN,
                                                                                                               TOK_MAX,
                                                                                                               DOC_MIN,
                                                                                                               PREPROCESS_DEACCENT)
dict_path = os.path.join(working_dir, dict_name)
# consider doing the same filtering as done in the class, then combining counts
try:
    id2word_map = gensim.corpora.dictionary.Dictionary.load(dict_path)
except FileNotFoundError:
    t0 = time.time()
    # ~4 min on the TLG corpus with accents removed
    id2word_map = gensim.corpora.Dictionary(doc_stream)
    # this cutoff might lose too much info, we'll see
    # ignore words appearing in fewer than `no_below` documents or in more than `no_above` (fraction) of all documents
    id2word_map.filter_extremes(no_below=no_below, no_above=no_above)
    id2word_map.save(dict_path)
    print('Time to mk new corpus dictionary:', time.time() - t0)
print(id2word_map)
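As a quick sanity check on the filtering, the dictionary's document-frequency counts can be inspected directly. A minimal sketch, assuming gensim's `Dictionary.dfs` mapping (token id → number of documents containing that token):
In [ ]:
# peek at the most document-frequent tokens that survived filter_extremes()
top_df = sorted(id2word_map.dfs.items(), key=lambda kv: kv[1], reverse=True)[:10]
for token_id, doc_freq in top_df:
    print(id2word_map[token_id], doc_freq)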
In [9]:
# Illustrate what this BoW space looks like with example doc
doc = "περὶ ποιητικῆς αὐτῆς τε καὶ τῶν εἰδῶν αὐτῆς, ἥν τινα δύναμιν ἕκαστον ἔχει, καὶ πῶς δεῖ συνίστασθαι τοὺς μύθους [10] εἰ μέλλει καλῶς ἕξειν ἡ ποίησις, ἔτι δὲ ἐκ πόσων καὶ ποίων ἐστὶ μορίων, ὁμοίως δὲ καὶ περὶ τῶν ἄλλων ὅσα τῆς αὐτῆς ἐστι μεθόδου, λέγωμεν ἀρξάμενοι κατὰ φύσιν πρῶτον ἀπὸ τῶν πρώτων."
doc = ' '.join(simple_preprocess(doc))
bow = id2word_map.doc2bow(tokenize(doc, rm_ascii=remove_ascii))
print(bow) # words both in BoW dict and doc
print(id2word_map[bow[0][0]]) # map int back to str
In [10]:
clip_docs_at = 25  # set to None for the final run
# make the BoW corpus
# creates a stream of bag-of-words vectors
corpus_bow_tlg = GenerateCorpus(docs_preprocessed, id2word_map, clip_docs=clip_docs_at)
# reduce corpus size for faster testing
#corpus_bow_tlg = gensim.utils.ClippedCorpus(corpus_bow_tlg, 100)
# vector = next(iter(corpus_bow_tlg))
# print(vector) # print the first vector in the stream
# [(0, 1), (1, 1), (2, 1), ...]
# # what is the most common word in that first article?
# most_index, most_count = max(vector, key=lambda _tuple: _tuple[1])
# print(id2word_map[most_index], most_count) # μιλησιοις 2
In [11]:
# Save BoW
# ~4 min on TLG corpus
bow_name = 'gensim_bow_1kgrk_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.mm'.format(no_below,
no_above,
TOK_MIN,
TOK_MAX,
DOC_MIN,
PREPROCESS_DEACCENT)
bow_path = os.path.join(working_dir, bow_name)
t0 = time.time()
gensim.corpora.MmCorpus.serialize(bow_path, corpus_bow_tlg)
print('Time to save BoW space:', time.time() - t0)
# Later load saved corpus with:
# corpus_bow_tlg = gensim.corpora.MmCorpus(bow_path)
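As the comment above notes, the serialized `.mm` file can be loaded back later; a quick round-trip check, assuming the `bow_path` defined in this cell:
In [ ]:
# reload the serialized BoW corpus and confirm how many documents it holds
corpus_check = gensim.corpora.MmCorpus(bow_path)
print(corpus_check)       # summary: number of documents, features, non-zero entries
print(len(corpus_check))  # number of documents serialized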
In [12]:
total_included_docs = len(corpus_bow_tlg.titles) # used later for testing results
In [13]:
# Quick testing using just a part of the corpus
NUM_TOPICS_LIST = [2, 3, 5, 10, 25, 50, 100]
NUM_TOPICS_LIST.append(len(get_epithets()))  # also try as many topics as there are traditional epithets
NUM_TOPICS_LIST = sorted(NUM_TOPICS_LIST)
PASSES = 1
In [15]:
for num_topics in NUM_TOPICS_LIST:
    print('Beginning training ...')
    print('... {} topics and {} passes ...'.format(num_topics, PASSES))
    t0 = time.time()
    lda_model = gensim.models.LdaMulticore(corpus_bow_tlg, num_topics=num_topics, id2word=id2word_map, passes=PASSES)
    # save LDA vector space
    lda_space_name = 'gensim_lda_space_1kgrk_numtopics{}_numpasses{}_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.mm'.format(num_topics,
                                                                                                                                           PASSES,
                                                                                                                                           no_below,
                                                                                                                                           no_above,
                                                                                                                                           TOK_MIN,
                                                                                                                                           TOK_MAX,
                                                                                                                                           DOC_MIN,
                                                                                                                                           PREPROCESS_DEACCENT)
    path_lda = os.path.join(working_dir, lda_space_name)
    gensim.corpora.MmCorpus.serialize(path_lda, lda_model[corpus_bow_tlg])
    # save model to working_dir, where the later cells load it from
    lda_model_name = 'gensim_lda_model_1kgrk_numtopics{}_numpasses{}_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.model'.format(num_topics,
                                                                                                                                              PASSES,
                                                                                                                                              no_below,
                                                                                                                                              no_above,
                                                                                                                                              TOK_MIN,
                                                                                                                                              TOK_MAX,
                                                                                                                                              DOC_MIN,
                                                                                                                                              PREPROCESS_DEACCENT)
    path_lda = os.path.join(working_dir, lda_model_name)
    lda_model.save(path_lda)
    print('Time to train LDA model space:', time.time() - t0)
In [16]:
# # Examples of how to use the model
# lda_model.print_topics(-1) # print a few most important words for each LDA topic
# # transform text into the bag-of-words space
# bow_vector = id2word_map.doc2bow(tokenize(doc, rm_ascii=remove_ascii))
# print([(id2word_map[id], count) for id, count in bow_vector])
# # transform into LDA space
# lda_vector = lda_model[bow_vector]
# print(lda_vector)
# # print the document's single most prominent LDA topic
# print(lda_model.print_topic(max(lda_vector, key=lambda item: item[1])[0]))
Word intrusion test: for each trained topic, take its top ten words, replace one of them with another, randomly chosen word (the intruder!), and see whether a human can reliably tell which one it was. If so, the trained topic is topically coherent (good); if not, the topic has no discernible theme (bad).
In [17]:
for num_topics in NUM_TOPICS_LIST:
    # load model
    lda_model_name = 'gensim_lda_model_1kgrk_numtopics{}_numpasses{}_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.model'.format(num_topics,
                                                                                                                                              PASSES,
                                                                                                                                              no_below,
                                                                                                                                              no_above,
                                                                                                                                              TOK_MIN,
                                                                                                                                              TOK_MAX,
                                                                                                                                              DOC_MIN,
                                                                                                                                              PREPROCESS_DEACCENT)
    print('Loading model: {} ...'.format(lda_model_name))
    print('... for word intrusion testing ...')
    path_lda = os.path.join(working_dir, lda_model_name)
    lda_model = gensim.models.LdaMulticore.load(path_lda)
    # select the top 50 words for each of the LDA topics
    print('Top 50 words of each LDA topic:')
    top_words = [[word for word, _ in lda_model.show_topic(topicno, topn=50)] for topicno in range(lda_model.num_topics)]
    print(top_words)
    print('')
    # pool the top 50 words of all topics into one large set
    all_words = set(itertools.chain.from_iterable(top_words))
    print("Can you spot the misplaced word in each topic?")
    # for each topic, replace a word at a different index, to make it more interesting
    replace_index = np.random.randint(0, 10, lda_model.num_topics)
    replacements = []
    for topicno, words in enumerate(top_words):
        other_words = all_words.difference(words)
        replacement = np.random.choice(list(other_words))
        replacements.append((words[replace_index[topicno]], replacement))
        words[replace_index[topicno]] = replacement
        print("%i: %s" % (topicno, ' '.join(words[:10])))
    print("Actual replacements were:")
    print(list(enumerate(replacements)))
    print('')
In [ ]:
# evaluate on documents **not** used in LDA training (here, drawn from the TLG corpus)
test_docs_path = os.path.expanduser('~/cltk_data/greek/text/tlg/plaintext/')
doc_stream = (tokens for _, tokens in iter_docs(test_docs_path))  # generator
test_docs = list(itertools.islice(doc_stream, 100, 200))  # [['πανυ', 'καλως', ...], [...], ...]
In [ ]:
def intra_inter(model, test_docs, num_pairs=10000):
    # split each test document into two halves and compute topics for each half
    part1 = [model[id2word_map.doc2bow(tokens[: len(tokens) // 2])] for tokens in test_docs]
    part2 = [model[id2word_map.doc2bow(tokens[len(tokens) // 2:])] for tokens in test_docs]
    # print computed similarities (uses cossim)
    print("average cosine similarity between corresponding parts (higher is better):")
    print(np.mean([gensim.matutils.cossim(p1, p2) for p1, p2 in zip(part1, part2)]))
    random_pairs = np.random.randint(0, len(test_docs), size=(num_pairs, 2))
    print("average cosine similarity between {} random parts (lower is better):".format(num_pairs))
    print(np.mean([gensim.matutils.cossim(part1[i[0]], part2[i[1]]) for i in random_pairs]))
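For reference, `gensim.matutils.cossim` works on sparse (id, weight) vectors like those the LDA model returns; a tiny illustration with made-up vectors:
In [ ]:
# cosine similarity of two toy sparse vectors (illustration only)
vec_a = [(0, 0.7), (1, 0.3)]
vec_b = [(0, 0.6), (2, 0.4)]
print(gensim.matutils.cossim(vec_a, vec_b))  # ~0.76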
In [ ]:
for num_topics in NUM_TOPICS_LIST:
    # load model
    lda_model_name = 'gensim_lda_model_1kgrk_numtopics{}_numpasses{}_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.model'.format(num_topics,
                                                                                                                                              PASSES,
                                                                                                                                              no_below,
                                                                                                                                              no_above,
                                                                                                                                              TOK_MIN,
                                                                                                                                              TOK_MAX,
                                                                                                                                              DOC_MIN,
                                                                                                                                              PREPROCESS_DEACCENT)
    print('Loading model: {} ...'.format(lda_model_name))
    print('... for testing split document topic matching ...')
    path_lda = os.path.join(working_dir, lda_model_name)
    lda_model = gensim.models.LdaMulticore.load(path_lda)
    print("LDA results:")
    # what should num_pairs be?
    intra_inter(lda_model, test_docs, num_pairs=total_included_docs)
    print('')
In [ ]:
id_auth_map = get_id_author()
In [ ]:
# write to file the topics for each doc
for num_topics in NUM_TOPICS_LIST:
    print('num topics', num_topics)
    # load model
    lda_model_name = 'gensim_lda_model_1kgrk_numtopics{}_numpasses{}_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.model'.format(num_topics,
                                                                                                                                              PASSES,
                                                                                                                                              no_below,
                                                                                                                                              no_above,
                                                                                                                                              TOK_MIN,
                                                                                                                                              TOK_MAX,
                                                                                                                                              DOC_MIN,
                                                                                                                                              PREPROCESS_DEACCENT)
    print('Loading model: {} ...'.format(lda_model_name))
    print('... scoring topics of all documents ...')
    path_lda = os.path.join(working_dir, lda_model_name)
    # https://radimrehurek.com/gensim/models/ldamodel.html#gensim.models.ldamodel.LdaModel.get_document_topics
    lda_model = gensim.models.LdaMulticore.load(path_lda)
    # mk save path name
    scores_name = lda_model_name[:-len('.model')] + '.scores'
    scores_path = os.path.join(working_dir, scores_name)
    doc_topics = ''
    print('Going to write LDA scores for each file at: "{}"'.format(scores_path))
    for file_name, tokens in iter_docs(docs_preprocessed):
        # print(file_name, tokens[:10])  # print the document title and its first ten tokens
        topic_distribution = str(lda_model[id2word_map.doc2bow(tokens)])
        # print(topic_distribution)
        # convert file name to author name, and get epithet
        # auth_id = file_name[len('TLG'):-len('.TXT')]  # for TLG
        auth_id = file_name[:-len('.txt')]  # for 1K Greek
        auth_name = None
        auth_epithet = None
        # auth_name = id_auth_map[auth_id]  # for TLG
        # auth_epithet = str(get_epithet_of_author(auth_id))  # for TLG
        doc_topics += 'file: ' + file_name + '\n'
        doc_topics += 'author: ' + str(auth_name) + '\n'
        doc_topics += 'epithet: ' + str(auth_epithet) + '\n'
        doc_topics += topic_distribution + '\n\n'
    with open(scores_path, 'w') as file_open:
        file_open.write(doc_topics)
    print('Wrote file to: "{}"'.format(scores_path))
    print('')
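The `lda_model[bow]` shorthand above omits topics below a probability threshold; the `get_document_topics` method linked in the comment can return the full distribution. A minimal sketch, assuming the `lda_model`, `id2word_map`, and `docs_preprocessed` from the cells above:
In [ ]:
# full topic distribution (including near-zero topics) for the first document
for file_name, tokens in itertools.islice(iter_docs(docs_preprocessed, rm_ascii=remove_ascii), 1):
    bow = id2word_map.doc2bow(tokens)
    full_dist = lda_model.get_document_topics(bow, minimum_probability=0.0)
    print(file_name)
    print(sorted(full_dist, key=lambda pair: pair[1], reverse=True)[:5])  # five most prominent topics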