LDA Topic Modeling in Python

The tutorial can be found here


In [1]:
from nltk.corpus import twitter_samples, TwitterCorpusReader
import sys
sys.path.append("../bhtsa")
from process_twt import preprocess, get_stopwords, get_slang_dict

fileIds = twitter_samples.fileids()
root = twitter_samples.root
# read the negative tweets from the corpus
negReader = TwitterCorpusReader(root, fileIds[0])
negTwt = []
for tweet in negReader.docs():
    negTwt.append(tweet['text'])
# take a look at some of the tweets
for twt in negTwt[:10]:
    print twt


hopeless for tmr :(
Everything in the kids section of IKEA is so cute. Shame I'm nearly 19 in 2 months :(
@Hegelbon That heart sliding into the waste basket. :(
“@ketchBurning: I hate Japanese call him "bani" :( :(”

Me too
Dang starting next week I have "work" :(
oh god, my babies' faces :( https://t.co/9fcwGvaki0
@RileyMcDonough make me smile :((
@f0ggstar @stuartthull work neighbour on motors. Asked why and he said hates the updates on search :( http://t.co/XvmTUikWln
why?:("@tahuodyy: sialan:( https://t.co/Hv1i0xcrL2"
Athabasca glacier was there in #1948 :-( #athabasca #glacier #jasper #jaspernationalpark #alberta #explorealberta #… http://t.co/dZZdqmf7Cz

In [2]:
# preprocess the tweets, strip contraction apostrophes, and show them again
import re
processed_twt = [preprocess(twt) for twt in negTwt]
p_twt = []
for twt in processed_twt:
    # strip the apostrophe from common contractions: i'm -> im, don't -> dont, it's -> it
    twt = re.sub('\'m', 'm', twt)
    twt = re.sub('\'t', 't', twt)
    twt = re.sub('\'s', '', twt)
    p_twt.append(twt)
for twt in p_twt[:10]:
    print twt


hopeless for tmr :(
everything in the kids section of ikea is so cute. shame im nearly 19 in 2 months :(
AT_USER that heart sliding into the waste basket. :(
“AT_USER i hate japanese call him "bani" :( :(” me too
dang starting next week i have "work" :(
oh god, my babies' faces :( URL
AT_USER make me smile :((
AT_USER AT_USER work neighbour on motors. asked why and he said hates the updates on search :( URL
why?:("AT_USER sialan:( URL
athabasca glacier was there in 1948 :-( athabasca glacier jasper jaspernationalpark alberta explorealberta … URL
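The preprocess helper comes from the bhtsa repo and isn't reproduced here. Judging from the output above, it lowercases the text, replaces user mentions and links with the AT_USER and URL placeholders, and strips the # from hashtags. A minimal sketch along those lines (the function name and regexes are my guesses, not the actual implementation):

import re

def preprocess_sketch(tweet):
    # illustrative stand-in for process_twt.preprocess, inferred from the output above
    twt = tweet.lower()                        # lowercase everything
    twt = re.sub(r'https?://\S+', 'URL', twt)  # replace links with a URL placeholder
    twt = re.sub(r'@\w+', 'AT_USER', twt)      # replace mentions with AT_USER
    twt = re.sub(r'#(\w+)', r'\1', twt)        # drop the # but keep the hashtag word
    return re.sub(r'\s+', ' ', twt).strip()    # collapse extra whitespace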

In [3]:
# remove stop words and expand slang abbreviations
import re
stop_words = get_stopwords()
slang_dict = get_slang_dict()
doc_set_removed = []
for twt in p_twt:
    removed = []
    words = twt.split()
    for w in words:
        # strip surrounding punctuation
        w = w.strip('\'"?,.')
        # check that the word starts with a letter and contains only alphanumerics
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", w)
        # expand slang abbreviations via the slang dictionary
        if w.strip() in slang_dict:
            w = slang_dict[w.strip()]
        # skip stop words and tokens that failed the pattern check
        if w in stop_words or val is None:
            continue
        else:
            removed.append(w)
    doc_set_removed.append(removed)
print doc_set_removed[1]


[u'kids', u'section', u'ikea', u'cute', u'shame', 'Instant Message', u'nearly', u'months']
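get_stopwords and get_slang_dict are also bhtsa helpers and not shown here. The output makes their roles clear enough: the stop word list drops common words, and the slang dictionary expands abbreviations into phrases ('im' became 'Instant Message' above, and 'lol' shows up later in the topics as 'laugh out loud'). A rough stand-in, purely for illustration:

def get_stopwords_sketch():
    # illustrative only -- the real list lives in process_twt
    return set(['a', 'an', 'the', 'is', 'so', 'in', 'of', 'and', 'AT_USER', 'URL'])

def get_slang_dict_sketch():
    # illustrative only -- maps chat abbreviations to expansions
    return {'im': 'Instant Message', 'lol': 'laugh out loud', 'tmr': 'tomorrow'}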

In [4]:
# tokenization
doc_set = []
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
for twt in doc_set_removed:
    doc_set.append(tokenizer.tokenize(' '.join(twt)))
print doc_set[1]


[u'kids', u'section', u'ikea', u'cute', u'shame', u'Instant', u'Message', u'nearly', u'months']

In [5]:
# stemming
from nltk.stem.porter import PorterStemmer
p_stemmer = PorterStemmer()
texts = []
for txt in doc_set:
    texts.append([p_stemmer.stem(i) for i in txt])
print texts[1]


[u'kid', u'section', u'ikea', u'cute', u'shame', u'Instant', u'Messag', u'nearli', u'month']

In [6]:
# construct the term dictionary from the stemmed texts
from gensim import corpora, models
dictionary = corpora.Dictionary(texts)
# convert to bag of words
corpus = [dictionary.doc2bow(text) for text in texts]
print corpus[0]


[(0, 1), (1, 1)]
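Each entry in corpus is a list of (token_id, count) pairs, so [(0, 1), (1, 1)] means the first tweet contains one occurrence each of the tokens with ids 0 and 1. The dictionary maps the ids back to the stems if you want to inspect them:

# look up which stems the ids in the first bag-of-words refer to
for token_id, count in corpus[0]:
    print dictionary[token_id], count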

In [7]:
# fit the LDA model; with 50 topics and 100 passes this might take a long time
import gensim
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=50, id2word=dictionary, passes=100)

In [8]:
ldamodel.print_topics(num_topics=50, num_words=10)


Out[8]:
[(0,
  u'0.081*"friend" + 0.048*"nice" + 0.045*"left" + 0.033*"money" + 0.033*"hello" + 0.030*"name" + 0.027*"that" + 0.024*"sa" + 0.023*"disappoint" + 0.020*"bet"'),
 (1,
  u'0.146*"wish" + 0.058*"cute" + 0.038*"weather" + 0.035*"cri" + 0.028*"thesi" + 0.017*"beauti" + 0.014*"be" + 0.013*"probabl" + 0.011*"horribl" + 0.010*"one"'),
 (2,
  u'0.155*"didnt" + 0.059*"ice" + 0.045*"cream" + 0.041*"phone" + 0.037*"gone" + 0.026*"anymor" + 0.023*"pretti" + 0.021*"fix" + 0.021*"terribl" + 0.018*"time"'),
 (3,
  u'0.147*"look" + 0.096*"fuck" + 0.087*"tire" + 0.041*"make" + 0.029*"read" + 0.026*"text" + 0.024*"tonight" + 0.022*"say" + 0.021*"avail" + 0.020*"book"'),
 (4,
  u'0.079*"meet" + 0.069*"head" + 0.047*"god" + 0.037*"holiday" + 0.029*"post" + 0.025*"take" + 0.024*"crazi" + 0.023*"fan" + 0.021*"enjoy" + 0.018*"my"'),
 (5,
  u'0.154*"peopl" + 0.041*"die" + 0.036*"job" + 0.035*"sound" + 0.028*"news" + 0.024*"link" + 0.022*"world" + 0.018*"forget" + 0.017*"joke" + 0.017*"hand"'),
 (6,
  u'0.132*"am" + 0.066*"soon" + 0.050*"live" + 0.047*"start" + 0.038*"final" + 0.029*"support" + 0.024*"run" + 0.023*"rest" + 0.017*"music" + 0.017*"short"'),
 (7,
  u'0.056*"hard" + 0.047*"So" + 0.036*"add" + 0.034*"snapchat" + 0.027*"ugli" + 0.026*"travel" + 0.025*"kikmeboy" + 0.024*"gift" + 0.019*"hotel" + 0.018*"kiksex"'),
 (8,
  u'0.065*"talk" + 0.055*"girl" + 0.051*"do" + 0.050*"dream" + 0.036*"album" + 0.028*"ignor" + 0.027*"assfac" + 0.023*"mom" + 0.022*"dad" + 0.020*"aint"'),
 (9,
  u'0.239*"love" + 0.078*"shit" + 0.066*"mayb" + 0.025*"near" + 0.017*"remov" + 0.014*"ran" + 0.013*"toothsensit" + 0.012*"time" + 0.011*"woke" + 0.008*"extra"'),
 (10,
  u'0.114*"hate" + 0.048*"heart" + 0.033*"write" + 0.033*"finish" + 0.033*"caus" + 0.032*"stage" + 0.027*"goodby" + 0.025*"repli" + 0.024*"english" + 0.024*"of"'),
 (11,
  u'0.096*"morn" + 0.057*"dog" + 0.026*"monday" + 0.026*"dead" + 0.025*"stress" + 0.021*"boo" + 0.020*"offic" + 0.018*"excit" + 0.018*"develop" + 0.017*"tea"'),
 (12,
  u'0.070*"xx" + 0.058*"a" + 0.050*"song" + 0.042*"hole" + 0.024*"fair" + 0.023*"throat" + 0.021*"broken" + 0.016*"reason" + 0.014*"inform" + 0.012*"trend"'),
 (13,
  u'0.283*"cant" + 0.131*"sleep" + 0.049*"tomorrow" + 0.043*"play" + 0.031*"doesnt" + 0.027*"ugh" + 0.019*"kikhorni" + 0.017*"liter" + 0.015*"amaz" + 0.015*"time"'),
 (14,
  u'0.155*"bad" + 0.035*"pictur" + 0.026*"mad" + 0.024*"rude" + 0.023*"hurt" + 0.023*"hell" + 0.022*"model" + 0.022*"info" + 0.021*"drop" + 0.018*"test"'),
 (15,
  u'0.167*"sad" + 0.090*"wont" + 0.045*"abl" + 0.037*"wasnt" + 0.023*"time" + 0.019*"awak" + 0.019*"stupid" + 0.016*"inact" + 0.015*"boyfriend" + 0.013*"leav"'),
 (16,
  u'0.146*"tri" + 0.035*"plan" + 0.034*"lunch" + 0.025*"london" + 0.023*"code" + 0.022*"apink" + 0.018*"start" + 0.016*"save" + 0.015*"sign" + 0.015*"ladi"'),
 (17,
  u'0.049*"wake" + 0.044*"hot" + 0.044*"ill" + 0.037*"like" + 0.034*"lucki" + 0.031*"friday" + 0.029*"stuff" + 0.029*"unfollow" + 0.028*"cold" + 0.022*"ahh"'),
 (18,
  u'0.121*"yeah" + 0.051*"leav" + 0.035*"hous" + 0.033*"answer" + 0.027*"chanc" + 0.024*"think" + 0.023*"block" + 0.016*"question" + 0.015*"decid" + 0.015*"twice"'),
 (19,
  u'0.043*"els" + 0.037*"befor" + 0.036*"king" + 0.035*"anyon" + 0.034*"f" + 0.032*"share" + 0.031*"black" + 0.024*"class" + 0.020*"cake" + 0.020*"sale"'),
 (20,
  u'0.246*"feel" + 0.073*"sick" + 0.041*"hurt" + 0.036*"late" + 0.030*"stay" + 0.025*"worst" + 0.017*"time" + 0.015*"there" + 0.012*"soul" + 0.011*"poorli"'),
 (21,
  u'0.053*"busi" + 0.048*"hey" + 0.048*"stream" + 0.046*"true" + 0.043*"couldnt" + 0.035*"listen" + 0.021*"cant" + 0.018*"lie" + 0.015*"outsid" + 0.013*"danc"'),
 (22,
  u'0.333*"Instant" + 0.333*"Messag" + 0.024*"game" + 0.019*"sad" + 0.013*"hungri" + 0.010*"forgot" + 0.009*"gut" + 0.007*"sell" + 0.007*"catch" + 0.006*"suit"'),
 (23,
  u'0.312*"t" + 0.305*"don" + 0.048*"know" + 0.047*"I" + 0.011*"leav" + 0.009*"play" + 0.006*"airport" + 0.006*"jacob" + 0.005*"just" + 0.005*"entir"'),
 (24,
  u'0.055*"asleep" + 0.054*"kik" + 0.045*"lost" + 0.038*"babe" + 0.036*"rememb" + 0.032*"fall" + 0.029*"photo" + 0.027*"mum" + 0.026*"hornykik" + 0.023*"kikmenow"'),
 (25,
  u'0.116*"get" + 0.039*"parti" + 0.026*"sore" + 0.022*"film" + 0.021*"date" + 0.019*"perform" + 0.017*"ohh" + 0.016*"card" + 0.016*"age" + 0.015*"learn"'),
 (26,
  u'0.264*"sorri" + 0.060*"hear" + 0.049*"hi" + 0.042*"weekend" + 0.015*"sister" + 0.015*"bag" + 0.014*"complet" + 0.014*"current" + 0.014*"unfair" + 0.012*"data"'),
 (27,
  u'0.120*"help" + 0.061*"happi" + 0.036*"super" + 0.029*"wow" + 0.025*"harri" + 0.021*"internet" + 0.018*"favorit" + 0.018*"bestfriend" + 0.017*"time" + 0.016*"match"'),
 (28,
  u'0.252*"to" + 0.137*"want" + 0.074*"go" + 0.032*"bye" + 0.028*"hug" + 0.018*"vidcon" + 0.017*"dude" + 0.013*"bed" + 0.012*"got" + 0.009*"chang"'),
 (29,
  u'0.085*"ye" + 0.075*"rain" + 0.072*"poor" + 0.030*"singl" + 0.023*"mine" + 0.020*"told" + 0.019*"broke" + 0.018*"ruin" + 0.018*"issu" + 0.017*"headach"'),
 (30,
  u'0.105*"mean" + 0.075*"kid" + 0.030*"pray" + 0.024*"forev" + 0.024*"episod" + 0.022*"worri" + 0.022*"ate" + 0.021*"concert" + 0.018*"taxi" + 0.018*"pay"'),
 (31,
  u'0.163*"hope" + 0.079*"stop" + 0.049*"movi" + 0.028*"send" + 0.027*"deathmatch" + 0.024*"okay" + 0.023*"cut" + 0.023*"bit" + 0.023*"word" + 0.019*"uk"'),
 (32,
  u'0.498*"miss" + 0.042*"bore" + 0.028*"summer" + 0.012*"jealou" + 0.011*"lot" + 0.009*"starv" + 0.008*"hahaha" + 0.007*"adult" + 0.007*"forev" + 0.006*"phone"'),
 (33,
  u'0.286*"pleas" + 0.227*"follow" + 0.095*"thank" + 0.060*"okay" + 0.042*"love" + 0.038*"believ" + 0.030*"justin" + 0.010*"driver" + 0.009*"rt" + 0.008*"wet"'),
 (34,
  u'0.106*"wait" + 0.051*"cat" + 0.049*"check" + 0.026*"allow" + 0.022*"sent" + 0.020*"offer" + 0.019*"email" + 0.017*"gone" + 0.016*"slow" + 0.014*"appar"'),
 (35,
  u'0.131*"watch" + 0.040*"kill" + 0.039*"k" + 0.038*"what" + 0.038*"f" + 0.036*"haha" + 0.032*"the" + 0.030*"paper" + 0.025*"zayniscomingbackonjuly26" + 0.022*"town"'),
 (36,
  u'0.274*"day" + 0.073*"isnt" + 0.036*"buy" + 0.021*"own" + 0.020*"moment" + 0.018*"join" + 0.017*"sigh" + 0.016*"usual" + 0.016*"finger" + 0.015*"favourit"'),
 (37,
  u'0.064*"time" + 0.060*"damn" + 0.057*"life" + 0.053*"eat" + 0.047*"boy" + 0.027*"shame" + 0.022*"deserv" + 0.022*"annoy" + 0.021*"mention" + 0.018*"bring"'),
 (38,
  u'0.107*"home" + 0.096*"week" + 0.068*"birthday" + 0.043*"account" + 0.038*"close" + 0.037*"car" + 0.036*"real" + 0.020*"size" + 0.019*"hit" + 0.017*"famili"'),
 (39,
  u'0.116*"out" + 0.112*"laugh" + 0.097*"loud" + 0.045*"hair" + 0.044*"food" + 0.017*"till" + 0.016*"time" + 0.015*"met" + 0.011*"van" + 0.010*"return"'),
 (40,
  u'0.098*"aw" + 0.075*"twitter" + 0.056*"wrong" + 0.040*"free" + 0.026*"ticket" + 0.023*"anyway" + 0.014*"guy" + 0.013*"water" + 0.012*"youu" + 0.012*"memori"'),
 (41,
  u'0.113*"snapchat" + 0.084*"night" + 0.064*"havent" + 0.062*"fun" + 0.046*"seen" + 0.020*"kikgirl" + 0.016*"digit" + 0.015*"look" + 0.014*"pass" + 0.013*"found"'),
 (42,
  u'0.081*"aww" + 0.060*"call" + 0.052*"lot" + 0.048*"unfortun" + 0.047*"come" + 0.034*"chang" + 0.024*"ubericecream" + 0.019*"ang" + 0.019*"lone" + 0.018*"insid"'),
 (43,
  u'0.181*"oh" + 0.104*"my" + 0.088*"God" + 0.020*"readi" + 0.018*"fail" + 0.015*"time" + 0.014*"la" + 0.014*"chees" + 0.013*"page" + 0.012*"am"'),
 (44,
  u'0.046*"month" + 0.036*"pain" + 0.033*"off" + 0.033*"a" + 0.032*"my" + 0.027*"sometim" + 0.027*"laugh" + 0.027*"due" + 0.021*"minut" + 0.020*"team"'),
 (45,
  u'0.102*"happen" + 0.070*"hour" + 0.056*"littl" + 0.027*"badli" + 0.025*"time" + 0.023*"visit" + 0.016*"sunday" + 0.015*"saturday" + 0.015*"bare" + 0.013*"wife"'),
 (46,
  u'0.103*"babi" + 0.070*"tweet" + 0.046*"tell" + 0.045*"actual" + 0.036*"onlin" + 0.035*"Right" + 0.035*"now" + 0.034*"huhu" + 0.033*"drive" + 0.023*"yesterday"'),
 (47,
  u'0.100*"guy" + 0.091*"kik" + 0.054*"win" + 0.042*"suck" + 0.036*"serious" + 0.029*"video" + 0.023*"sexi" + 0.018*"indiemus" + 0.016*"kikm" + 0.016*"kikchat"'),
 (48,
  u'0.083*"school" + 0.048*"care" + 0.031*"idea" + 0.024*"sun" + 0.024*"you" + 0.024*"schedul" + 0.022*"shop" + 0.020*"updat" + 0.018*"mind" + 0.018*"confus"'),
 (49,
  u'0.079*"applic" + 0.063*"not" + 0.036*"see" + 0.036*"i" + 0.022*"scare" + 0.019*"have" + 0.019*"half" + 0.018*"hard" + 0.018*"extrem" + 0.016*"hold"')]

In [9]:
import pyLDAvis.gensim
vis_data = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis_data)


Out[9]:
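Out[9] renders the interactive pyLDAvis topic map inline in the notebook, which doesn't survive this static export. For a standalone copy, pyLDAvis can write the same view to an HTML file (the filename here is just a placeholder):

# export the interactive visualization to a standalone HTML file
pyLDAvis.save_html(vis_data, 'lda_topics.html')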
