In [1]:
"""Topic Modeling"""
# LDA here stands for Latent Dirichlet Allocation (topic modeling)
"Not to be confused with the other LDA, Linear Discriminant Analysis, which is used for classification"
Out[1]:
In [7]:
## Building a topic model
from gensim import corpora, models, matutils

# Load the data.
# `corpus` wraps the document file and its vocabulary file; it is iterated
# document by document below and exposes the id -> word mapping as
# `corpus.id2word`.
corpus = corpora.BleiCorpus('./ch04/ap/ap.dat', './ch04/ap/vocab.txt')

# Build the topic model.
# NOTE(review): alpha=None is presumably meant to keep gensim's default
# prior; the next experiment overrides it with an explicit value.
NUM_TOPICS = 100
model = models.ldamodel.LdaModel(
    corpus, num_topics=NUM_TOPICS, id2word=corpus.id2word, alpha=None)

# Inspect the model: indexing it with a single document returns the topics
# the model assigns to that document.
doc = corpus.docbyoffset(0)
topics = model[doc]
# print(...) with one argument gives the same output under the Python 2
# print statement and the Python 3 print function.
print(topics)
In [12]:
# Histogram: how many topics does each document make use of?
import matplotlib.pyplot as plt
import numpy as np

# One entry per document: the number of topics the model returns for it.
num_topics_used = list(len(model[bow]) for bow in corpus)

fig, ax = plt.subplots()
ax.hist(num_topics_used, bins=np.arange(42))
ax.set_xlabel('Nr of topics')
ax.set_ylabel('Nr of documents')
fig.tight_layout()
fig.savefig('ch04/Figure_04_01.png')
# fig.show()
In [13]:
# Same experiment again, this time with alpha=1.0.
# ALPHA is the knob to change when experimenting with this parameter.
ALPHA = 1.0
model1 = models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=corpus.id2word,
    alpha=ALPHA,
)

# Topics-per-document counts for the second model.
num_topics_used1 = list(len(model1[bow]) for bow in corpus)

# Plot both distributions side by side on a shared set of bins.
fig, ax = plt.subplots()
ax.hist([num_topics_used, num_topics_used1], bins=np.arange(42))
ax.set_xlabel('Nr of topics')
ax.set_ylabel('Nr of documents')

# Label positions were chosen by hand (trial and error) to look good.
ax.text(9, 223, r'default alpha')
ax.text(26, 156, 'alpha=1.0')

fig.tight_layout()
fig.savefig('ch04/Figure_04_02.png')
In [14]:
# Comparing docs by topics
In [19]:
from gensim import corpora, models, matutils
from scipy.spatial import distance

# Dense topic matrix for the whole corpus. corpus2dense lays documents out
# as COLUMNS, so the shape is (num_topics, num_documents).
topics = matutils.corpus2dense(model[corpus], num_terms=model.num_topics)

# Total weight of each topic across all documents, and the heaviest topic.
weight = topics.sum(1)
max_topic = weight.argmax()

# pdist measures distances between ROWS. Transpose first so that each row is
# one document; otherwise the result would be topic-to-topic distances and
# could not be indexed by document id.
pairwise = distance.squareform(distance.pdist(topics.T))

# Overwrite the zero diagonal with a value larger than any real distance so
# that a document is never reported as its own nearest neighbour.
large = pairwise.max() + 1
for i in range(len(pairwise)):
    pairwise[i, i] = large
def close_to(doc_id):
    """Return the index with the smallest distance to `doc_id`.

    Looks up row `doc_id` of the module-level `pairwise` distance matrix and
    returns the position of its minimum. The diagonal of `pairwise` is
    pre-filled with a value above every real distance, so the result is
    never `doc_id` itself.
    """
    distances = pairwise[doc_id]
    return distances.argmin()
In [29]:
"""word cloud"""
from __future__ import print_function
# Set to True once the "pytagcloud missing" notice has been printed, so the
# notice is shown a single time rather than on every call.
warned_of_error = False


def create_cloud(oname, words, maxsize=120, fontname='Lobster'):
    '''Creates a word cloud (when pytagcloud is installed)

    If pytagcloud cannot be imported, a notice is printed (the first time
    only) and the function returns without producing any output.

    Parameters
    ----------
    oname : output filename
    words : list of (str, value)
        Each word paired with its weight; weights are scaled by 10000 and
        truncated to integers before being handed to pytagcloud.
    maxsize : int, optional
        Size of maximum word. The best setting for this parameter will often
        require some manual tuning for each input.
    fontname : str, optional
        Font to use.
    '''
    global warned_of_error
    try:
        from pytagcloud import create_tag_image, make_tags
    except ImportError:
        if not warned_of_error:
            print("Could not import pytagcloud. Skipping cloud generation")
            # Remember that the user has been told, so later calls stay quiet.
            warned_of_error = True
        return

    # gensim returns a weight between 0 and 1 for each word, while pytagcloud
    # expects an integer word count. So, we multiply by a large number and
    # round. For a visualization this is an adequate approximation.
    words = [(w, int(v*10000)) for w, v in words]
    tags = make_tags(words, maxsize=maxsize)
    create_tag_image(tags, oname, size=(1800, 1200), fontname=fontname)
# Find the most discussed topic: add up each topic's weight over the whole
# corpus and pick the one with the largest total.
topics = matutils.corpus2dense(model[corpus], num_terms=model.num_topics)
weight = topics.sum(axis=1)
max_topic = weight.argmax()

# Fetch the top 64 words for that topic; left to its default, show_topic
# would return only 10.
words = model.show_topic(max_topic, topn=64)

# create_cloud checks for pytagcloud itself and produces no image when the
# package is missing.
create_cloud('./ch04/cloud_lda.png', words)
In [24]:
In [ ]: