notebook.community

Edit and run



In [1]:

    
"""Topic Modeling"""

# LDA in this notebook means Latent Dirichlet Allocation (topic modeling).

# The bare string below is the cell's last expression, so the notebook echoes
# it as the cell output; it names the other technique sharing the acronym.
"another LDA: Linear Discriminant Analysis, use for classification"









    Out[1]:





'another LDA: Linear Discriminant Analysis, use for classification'



In [7]:

    
## Building a topic model

from gensim import corpora, models, matutils

# Load the data
# `corpus` holds all of the text documents, read through gensim's BleiCorpus
# reader from the data file and its accompanying vocabulary file.
corpus = corpora.BleiCorpus('./ch04/ap/ap.dat', './ch04/ap/vocab.txt')

# Build the topic model
# alpha=None leaves the prior at gensim's default; a later cell contrasts this
# with an explicit alpha.
NUM_TOPICS = 100
model = models.ldamodel.LdaModel(
    corpus, num_topics=NUM_TOPICS, id2word=corpus.id2word, alpha=None)

# Inspect the model on the first document: indexing the model with a document
# yields a list of (topic_id, weight) pairs for that document.
doc = corpus.docbyoffset(0)
topics = model[doc]
# Parenthesised form works as a Python 2 statement, as a Python 3 call, and
# after `from __future__ import print_function` has been run in the kernel.
print(topics)









    



[(4, 0.020555383163526932), (12, 0.032550468767385821), (14, 0.013846940809770912), (15, 0.20590805410733648), (16, 0.023017640814599807), (19, 0.049434356424725261), (21, 0.01330177697150811), (42, 0.034213041487348314), (60, 0.013205721360023773), (62, 0.010065894053237543), (68, 0.32440590266582342), (69, 0.071565308652662613), (71, 0.051989900582553039), (92, 0.095182473611664029), (95, 0.015574266384284748)]



In [12]:

    
# plot it 
import matplotlib.pyplot as plt
import numpy as np
# For every document, count how many topics the model assigns to it
num_topics_used = [len(model[doc]) for doc in corpus]

# Histogram of those counts, using integer bin edges 0..41
fig, ax = plt.subplots()
ax.hist(num_topics_used, bins=np.arange(42))
ax.set_xlabel('Nr of topics')
ax.set_ylabel('Nr of documents')
fig.tight_layout()
fig.savefig('ch04/Figure_04_01.png')



In [13]:

    
# Now, repeat the same exercise using alpha=1.0
# You can edit the constant below to play around with this parameter
ALPHA = 1.0

# Same corpus and topic count as the first model; only alpha differs
model1 = models.ldamodel.LdaModel(
    corpus, num_topics=NUM_TOPICS, id2word=corpus.id2word, alpha=ALPHA)
num_topics_used1 = [len(model1[doc]) for doc in corpus]

# Side-by-side histograms: default-alpha model vs. the ALPHA model
fig, ax = plt.subplots()
ax.hist([num_topics_used, num_topics_used1], np.arange(42))
ax.set_ylabel('Nr of documents')
ax.set_xlabel('Nr of topics')

# The coordinates below were fit by trial and error to look good
ax.text(9, 223, r'default alpha')
# Build the label from ALPHA so it stays correct when the constant is edited
ax.text(26, 156, 'alpha={}'.format(ALPHA))
fig.tight_layout()
fig.savefig('ch04/Figure_04_02.png')



In [14]:

    
# Comparing docs by topics



In [19]:

    
from gensim import corpora, models, matutils
# Dense matrix of topic weights: one ROW per topic, one COLUMN per document
# (corpus2dense lays documents out as columns).
topics = matutils.corpus2dense(model[corpus], num_terms=model.num_topics)
# Total weight of each topic, summed across all documents
weight = topics.sum(1)
max_topic = weight.argmax()

from scipy.spatial import distance
# pdist measures distances between ROWS, so transpose first: we want
# document-to-document distances, giving a (num_docs x num_docs) matrix.
pairwise = distance.squareform(distance.pdist(topics.T))

# Put a value larger than any real distance on the diagonal so that a
# document is never reported as its own nearest neighbour.
large = pairwise.max() + 1
for i in range(len(pairwise)):
    pairwise[i, i] = large
    
def close_to(doc_id):
    """Return the index with the smallest distance to `doc_id`.

    Looks up row `doc_id` of the module-level `pairwise` matrix and returns
    the position of its minimum entry.
    """
    distances = pairwise[doc_id]
    return distances.argmin()



In [29]:

    
"""word cloud"""

from __future__ import print_function
# Set to True once the "pytagcloud missing" notice has been printed, so the
# notice appears at most once per session.
warned_of_error = False

def create_cloud(oname, words, maxsize=120, fontname='Lobster'):
    '''Creates a word cloud (when pytagcloud is installed)

    If pytagcloud cannot be imported, nothing is generated; a notice is
    printed the first time this happens and later calls return quietly.

    Parameters
    ----------
    oname : output filename
    words : list of (str, value)
        Word and its weight, in that order.
    maxsize : int, optional
        Size of maximum word. The best setting for this parameter will often
        require some manual tuning for each input.
    fontname : str, optional
        Font to use.

    Returns
    -------
    None
    '''
    global warned_of_error
    try:
        from pytagcloud import create_tag_image, make_tags
    except ImportError:
        if not warned_of_error:
            print("Could not import pytagcloud. Skipping cloud generation")
            # Remember that we warned, so repeated calls stay quiet
            warned_of_error = True
        return

    # gensim returns a weight between 0 and 1 for each word, while pytagcloud
    # expects an integer word count. So, we multiply by a large number and
    # round. For a visualization this is an adequate approximation.
    words = [(w, int(v*10000)) for w, v in words]
    tags = make_tags(words, maxsize=maxsize)
    create_tag_image(tags, oname, size=(1800, 1200), fontname=fontname)
    
# Find the most discussed topic: add up each topic's weight over all
# documents and pick the topic whose total is highest.

topics = matutils.corpus2dense(model[corpus], num_terms=model.num_topics)
weight = topics.sum(axis=1)
max_topic = weight.argmax()


# Ask for the top 64 words of that topic
# (show_topic would return only 10 words without the second argument)
words = model.show_topic(max_topic, 64)

# create_cloud checks for pytagcloud itself and is otherwise a no-op
create_cloud('./ch04/cloud_lda.png', words)









    



(u'united', 0.0075736671970757759)



In [24]:



In [ ]: