In [2]:
from gensim import corpora, models
import gensim
import gzip

import os.path
if not os.path.isfile('news.gz'):
    print('Downloading news Dataset...')
    !wget http://acube.di.unipi.it/repo/news.gz

news_corpus = {}
texts = []
categories = set()

# Open in text mode ('rt') so each line is a str; in binary mode the
# bytes records would fail when concatenated with '\n' below.
with gzip.open('news.gz', 'rt') as news_raw:
    for line in news_raw:
        title = line.lower().strip()
        description = news_raw.readline().lower().strip()
        url = news_raw.readline().lower().strip()
        sequence = news_raw.readline().lower().strip()
        timestamp = news_raw.readline().lower().strip()
        publisher = news_raw.readline().lower().strip()
        category = news_raw.readline().lower().strip()
        news_corpus[sequence] = {'description': description,
                                 'url': url,
                                 'title': title,
                                 'timestamp': timestamp,
                                 'publisher': publisher,
                                 'category': category}
        # Skip the blank line that separates records
        news_raw.readline()
        words = gensim.utils.simple_preprocess(title + '\n' + description)
        tokens = [token for token in words
                  if token not in gensim.parsing.preprocessing.STOPWORDS]
        texts.append(tokens)
        categories.add(category)

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
ldamodel = models.ldamodel.LdaModel(corpus, num_topics=len(categories),
                                    id2word=dictionary, passes=30)


/home/baali/pip3/lib/python3.4/site-packages/gensim/utils.py:1015: UserWarning: Pattern library is not installed, lemmatization won't be available.
  warnings.warn("Pattern library is not installed, lemmatization won't be available.")
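
A quick sanity check on the parsed data (a sketch assuming the cell above ran
successfully; the seven-line record layout of news.gz is inferred from the
parsing code, not from a spec):

print('%d documents across %d categories' % (len(texts), len(categories)))
sample = next(iter(news_corpus.values()))
print(sample['title'], '--', sample['category'])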

In [7]:
import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)


Out[7]:
[pyLDAvis interactive topic visualization rendered here]
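
The interactive topic map only renders inside a live notebook session. To keep
a standalone copy, pyLDAvis can also serialize the prepared data to an HTML
file (a minimal sketch; the filename lda_news.html is arbitrary):

vis_data = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
# Writes a self-contained HTML page viewable outside the notebook
pyLDAvis.save_html(vis_data, 'lda_news.html')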

In [3]:
# Saliency related functions adapted from https://github.com/StanfordHCI/termite/blob/master/pipeline/compute_saliency.py
def computeTopicInfo( ldamodel, dictionary ):
    topn = len(dictionary.items())//100 # top 1% of all dictionary terms
    topic_info = []
    for index in range(ldamodel.num_topics):
        topic_weight = sum([prob_score for term, prob_score in ldamodel.show_topic(index, topn=topn) ])
        topic_info.append( {
            'topic' : ldamodel.show_topic(index, topn=topn),
            'weight' : topic_weight
        } )

    return topic_info

def getNormalized( counts ):
    """Rescale a list of counts, so they represent a proper probability distribution."""
    tally = sum( counts )
    if tally == 0:
        probs = [ d for d in counts ]
    else:
        probs = [ d / tally for d in counts ]
    return probs

def computeTermFreq(corpus):
    from collections import Counter
    term_freq = Counter()
    for doc in corpus:
        for (term, freq) in doc:
            term_freq[term] += freq
    return term_freq

def computeTermInfo(ldamodel, dictionary, corpus):
    """Iterate over the list of terms. Compute frequency, distinctiveness, saliency."""
    topic_info = computeTopicInfo(ldamodel, dictionary)
    topic_marginal = getNormalized( [ d['weight'] for d in topic_info ] )
    term_freq = computeTermFreq(corpus)
    term_info = []
    for (tid, term) in dictionary.items():
        # This is not giving expected results or maybe I am using it
        # wrong(?)
        # counts = ldamodel.get_term_topics(tid, minimum_probability=0.00001)
        # if not counts:
        #     print('skipping %s as no term_topics were returned for it' % term)
        #     continue
        # probs = [0 for index in range(ldamodel.num_topics)]
        # for (index, prob) in counts:
        #     probs[index] = prob
        frequency = term_freq[tid]
        probs = []
        for index in range(ldamodel.num_topics):
            probs.append(ldamodel.expElogbeta[index][tid])
        probs = getNormalized( probs )
        distinctiveness = getKLDivergence( probs, topic_marginal )
        saliency = frequency * distinctiveness
        term_info.append({
            'term' : term,
            'saliency' : saliency,
            'frequency' : frequency,
            'distinctiveness' : distinctiveness,
            'rank' : None,
            'visibility' : 'default'
        })
    return term_info

def getKLDivergence( P, Q ):
    """Compute KL-divergence from P to Q"""
    import math
    divergence = 0
    assert len(P) == len(Q)
    for i in range(len(P)):
        p = P[i]
        q = Q[i]
        assert p >= 0
        assert q >= 0
        if p > 0:
            divergence += p * math.log( p / q )
    return divergence
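
For reference, these functions implement the saliency measure from the Termite
paper linked above: distinctiveness(w) is the KL divergence between a term's
conditional distribution over topics, P(T|w), and the marginal topic
distribution P(T), and saliency(w) = frequency(w) * distinctiveness(w). A term
scores highly when it is both frequent in the corpus and concentrated in a few
topics, which is what makes it a good candidate topic label.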

In [4]:
topic_info = computeTopicInfo(ldamodel, dictionary)
term_info = computeTermInfo(ldamodel, dictionary, corpus)

topic_info = sorted( topic_info, key = lambda info : -info['weight'] )
term_info = sorted( term_info, key = lambda info : -info['saliency'] )
for i, element in enumerate( term_info ):
    element['rank'] = i

for index in range(ldamodel.num_topics):
    topics = ldamodel.get_topic_terms(index, topn=40)
    terms = [ldamodel.id2word[topic_id] for (topic_id, prob) in topics]
    # Reorder terms based on saliency rank
    reordered_terms = sorted( [term for term in term_info if term['term'] in terms],
                              key = lambda info : info['rank'] )
    print('Terms for topic %d (model order vs. saliency order):' % index)
    print(' '.join(terms[:20]))
    print(' '.join([term['term'] for term in reordered_terms[:20]]))


Terms for topic 0 (model order vs. saliency order):
new online network google company old social like service internet home school university pakistani apple time web search facebook mobile
new million company network google internet online service apple social like school home facebook old search video detroit twitter pakistani
Terms for topic 1 (model order vs. saliency order):
nuclear said japan power plant coast water crisis tuesday river friday state wednesday saturday festival south film thursday fourth radiation
said nuclear japan power state officials wednesday plant earthquake saturday coast water sunday south tuesday thursday friday crisis north river
Terms for topic 2 (model order vs. saliency order):
japan earthquake companies economic world march fund april tsunami said week recovery food leaders euro wedding presidential month missing monetary
said japan world earthquake prices march economic companies country year fund economy friday like tsunami wedding recovery april global east
Terms for topic 3 (model order vs. saliency order):
said federal year billion state million wednesday bank government percent new market company oil group tuesday financial thursday expected deal
said new government billion federal bank percent state market million oil wednesday company financial group deal prices largest reported sales
Terms for topic 4 (model order vs. saliency order):
game new season team league win series time final second players night victory year coach play scored games york sunday
game new season team league series players victory win games scored coach wednesday nfl points final night tournament play boston
Terms for topic 5 (model order vs. saliency order):
new open world study year according number french people risk women crackdown week health states years research united suggests champion
new open study people world number french according round second united states women york risk health year tuesday friday research
Terms for topic 6 (model order vs. saliency order):
said president people monday friday thursday killed wednesday forces police tuesday government officials security city al united sunday state saturday
said president killed police government forces security leader state city people officials al chief military obama killing wednesday bin court
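
As a final check, the globally most salient terms can be printed straight from
the sorted term_info list computed above (a quick sketch):

for entry in term_info[:10]:
    print('%-15s saliency=%12.1f frequency=%6d'
          % (entry['term'], entry['saliency'], entry['frequency']))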