In [1]:
import gensim
from gensim import corpora, models
from gensim.parsing.preprocessing import STOPWORDS
from itertools import *
import csv
import pyodbc
import re
import pyLDAvis.gensim

In [ ]:
# Step 1: fetch tweets from database

In [ ]:
# Connect to the 'Twitter' ODBC data source and pull the text of every tweet.
# try/finally guarantees the connection is closed even if the query fails
# (the original leaked the connection on any exception before close()).
connection = pyodbc.connect(dsn='Twitter')
try:
    cursor = connection.cursor()
    cursor.execute("SELECT tweet_text FROM tweets")
    # fetchall() returns a list of pyodbc Row objects; each row holds the
    # single tweet_text column. Rows are kept as-is because the preprocessing
    # step below iterates over each row's columns.
    documents = cursor.fetchall()
finally:
    connection.close()

In [ ]:


In [ ]:
# Step 2: preprocess tweets

In [ ]:
# remove RT/via retweet markers, @mentions, hashtags and URLs, then collapse
# runs of spaces/tabs and trim each tweet.
# Patterns are compiled once here instead of being recompiled on every tweet.
# (Raw-string literals below are byte-identical to the original escaped forms.)
_RE_RETWEET = re.compile(r"(RT|via)((?:\b\W*@\w+)+)")   # "RT @user" / "via @user" chains
_RE_MENTION = re.compile(r"@\w+")
_RE_HASHTAG = re.compile(r"#\w+")
_RE_URL = re.compile(r"(f|ht)(tp)(s?)(://)\S+\s*")      # http/https/ftp/ftps links
_RE_MULTISPACE = re.compile(r"[ \t]{2,}")
_RE_TRIM = re.compile(r"^\s+|\s+$")                      # leading/trailing whitespace


def clean_tweet(text):
    """Strip Twitter boilerplate (RT/via markers, mentions, hashtags, URLs)
    from one tweet and normalize its whitespace."""
    text = _RE_RETWEET.sub(" ", text)
    text = _RE_MENTION.sub(" ", text)
    text = _RE_HASHTAG.sub(" ", text)
    text = _RE_URL.sub(" ", text)
    text = _RE_MULTISPACE.sub(" ", text)
    return _RE_TRIM.sub("", text)


document_process = []

# `documents` is a list of DB rows; each row yields one tweet_text column
for row in documents:
    for tweet_text in row:
        document_process.append(clean_tweet(tweet_text))

In [ ]:
# lemmatize
# gensim.utils.lemmatize turns each cleaned tweet into "word/POS" tokens

documents_lemma = []
for cleaned_tweet in document_process:
    documents_lemma.append(gensim.utils.lemmatize(cleaned_tweet))

In [6]:
# find associated words

# Train a small Word2Vec model on the lemmatized tweets: 100-dimensional
# vectors, 5-token context window, tokens seen fewer than 3 times ignored.
# NOTE(review): in gensim >= 4.0 `size` was renamed `vector_size` and
# `most_similar` moved to `model.wv.most_similar` — confirm installed version.
model = gensim.models.Word2Vec(documents_lemma, size=100, window=5, min_count=3, workers=4)

# tokens are "word/POS" pairs, so the query term must carry its POS tag too
model.most_similar("pokemon/NN")


Out[6]:
[('catch/VB', 0.9861735701560974),
 ('go/VB', 0.9795925617218018),
 ('stay/VB', 0.9724176526069641),
 ('win/VB', 0.9718842506408691),
 ('backpack/NN', 0.9690099358558655),
 ('powerbank/NN', 0.968267023563385),
 ('time/NN', 0.9642168283462524),
 ('oneplus/JJ', 0.9622034430503845),
 ('cop/NN', 0.962091863155365),
 ('gotta/VB', 0.9618346095085144)]

In [ ]:
# remove stopwords and words with only one letter
# (each token looks like "word/POS"; filtering is done on the word part only,
# but the full tagged token is what gets kept)

texts = []
for line in documents_lemma:
    kept_tokens = []
    for token in line:
        word = token.split('/')[0]
        if word not in STOPWORDS and len(word) >= 2:
            kept_tokens.append(token)
    texts.append(kept_tokens)

In [ ]:
# generate n-gram

from nltk import ngrams

# For every document, prepend its bigrams (as tuples) to its unigram tokens.
terms_bigram = [list(ngrams(tokens, 2)) + tokens for tokens in texts]

# Flatten each document's mixed terms into plain strings: bigram tuples become
# "left_right"; unigram strings pass through unchanged.
texts_full = []
for mixed_terms in terms_bigram:
    texts_full.append(
        ['_'.join(term) if isinstance(term, tuple) else term
         for term in mixed_terms]
    )

In [9]:
# find most frequent words

from collections import Counter

# flatten every document into one token stream and count occurrences
words = [token for doc in texts_full for token in doc]
c = Counter(words)

# top 10 — most_common(10) lets Counter do the partial selection itself
# instead of fully sorting all counts and slicing (most_common()[:10])
c.most_common(10)


Out[9]:
[('catch/VB', 1101),
 ('pokemon/NN', 1085),
 ('time/NN', 687),
 ('legendary/JJ', 528),
 ('play/VB', 309),
 ('person/NN', 305),
 ('phone/NN', 148),
 ('new/JJ', 143),
 ('pokeball/NN', 127),
 ('night/NN', 121)]

In [ ]:
# remove frequent and rare words

# .items() instead of the Python-2-only .iteritems(): behaves identically
# here and keeps the cell working under Python 3 as well.
tokens_rare = {k: v for k, v in c.items() if v <= 50}        # seen 50 times or fewer
tokens_frequent = {k: v for k, v in c.items() if v > 4000}   # seen more than 4000 times

# Keep only tokens that are neither rare nor frequent. Plain membership tests
# replace the original `.get(word, -1) == -1` sentinel idiom — counts are
# always positive, so the two are equivalent, but `not in` states the intent.
texts_final = [[word for word in text
                if word not in tokens_rare and word not in tokens_frequent]
               for text in texts_full]

In [ ]:
# total number of tokens left after filtering; summing the per-document
# lengths gives the same integer as len(sum(texts_final, [])) without the
# quadratic list concatenation that sum(..., []) performs
sum(len(doc) for doc in texts_final)

In [ ]:


In [ ]:
#Step 3:  build topic models

In [ ]:
# create corpus

# Build the token dictionary and the bag-of-words corpus for LDA.
# NOTE(review): this uses texts_full, not texts_final (the version with rare
# and frequent tokens removed, computed above) — confirm whether skipping
# that filtering step here is intentional.
id2word = corpora.Dictionary(texts_full)
mm = [id2word.doc2bow(text) for text in texts_full]

# optional persistence: save/load the dictionary and serialized corpus so a
# later session can skip the rebuild
# id2word.save('project.dict')
# corpora.MmCorpus.serialize('project.mm', mm) # store to disk, for later use
# id2word = corpora.Dictionary.load('project.dict')
# mm = corpora.MmCorpus('project.mm')

In [ ]:
# calculate TF-IDF

# Fit TF-IDF weights on the bag-of-words corpus, then wrap the corpus in the
# transformation; `corpus` is the (lazily transformed) stream the LDA model
# below consumes.
tfidf = gensim.models.tfidfmodel.TfidfModel(mm)
corpus = tfidf[mm]

In [ ]:
# build model

# Train a 50-topic LDA model on the TF-IDF-weighted corpus: online updates
# every chunk (update_every=1) of 500 documents, 20 full passes over the
# corpus (passes=20 makes this the slowest cell in the notebook).
# NOTE(review): no random seed is fixed, so topic numbering and content can
# vary between runs — consider the model's random-state parameter if the
# installed gensim version supports it (TODO confirm).
lda = models.ldamodel.LdaModel(corpus, id2word=id2word, num_topics=50, update_every=1, chunksize=500, passes=20)

# optional persistence so the expensive training can be skipped later
# lda.save('pokemongo_model.lda')
# lda = gensim.models.LdaModel.load('pokemongo_model.lda')

In [15]:
# print topic models

# Returns a list of (topic_id, formatted top-words string) pairs for all 50
# topics, 5 words each; left as the cell's last expression so the notebook
# displays it.
lda.print_topics(num_topics=50, num_words=5)


Out[15]:
[(0,
  u'0.031*best/JJ + 0.029*bring/VB + 0.024*love/NN + 0.018*church/NN + 0.018*best/JJ_person/NN'),
 (1,
  u'0.168*meet/VB + 0.082*gotta/VB + 0.081*gotta/VB_catch/VB + 0.049*worst/JJ_pokemon/NN + 0.049*worst/JJ'),
 (2,
  u'0.092*playing/NN + 0.086*public/NN + 0.071*playing/NN_public/NN + 0.049*man/NN + 0.039*catch/VB_man/NN'),
 (3,
  u'0.132*trainer/NN_campus/NN + 0.132*serve/VB_pokemon/NN + 0.132*guard/NN_serve/VB + 0.121*guard/NN + 0.025*pokemon/NN'),
 (4,
  u'0.049*person/NN + 0.027*lame/JJ + 0.026*person/NN_lame/JJ + 0.026*lame/JJ_play/VB + 0.019*value/NN'),
 (5,
  u'0.132*way/NN_woman/NN + 0.132*woman/NN_heart/NN + 0.131*heart/NN + 0.130*woman/NN + 0.102*way/NN'),
 (6,
  u'0.170*meet/VB_friday/NN + 0.170*official/NN_meet/VB + 0.157*official/NN + 0.014*thank/NN + 0.010*latest/JJ'),
 (7,
  u'0.017*fan/NN + 0.015*head/NN + 0.012*cute/JJ + 0.010*charmander/NN + 0.009*catch/VB_charmander/NN'),
 (8,
  u'0.073*rt/VB + 0.043*feel/VB + 0.038*pok\xe9mon/NN + 0.025*day/NN + 0.022*catch/VB_pok\xe9mon/NN'),
 (9,
  u'0.049*help/VB + 0.048*chart/NN + 0.042*rarity/NN_chart/NN + 0.042*rarity/NN + 0.042*chart/NN_help/VB'),
 (10,
  u'0.130*know/VB + 0.045*pokemon/NN + 0.032*look/VB + 0.023*walk/VB + 0.015*eating/NN'),
 (11,
  u'0.028*definitely/RB + 0.026*delete/VB + 0.025*risk/NN + 0.024*worth/NN_risk/NN + 0.024*delete/VB_phone/NN'),
 (12,
  u'0.054*god/NN + 0.018*throw/VB + 0.013*valor/NN + 0.013*throw/VB_phone/NN + 0.012*team/NN_valor/NN'),
 (13,
  u'0.041*think/VB + 0.027*player/NN + 0.018*country/NN + 0.017*security/NN + 0.015*bury/VB'),
 (14,
  u'0.029*real/JJ + 0.029*world/NN + 0.022*retweet/NN + 0.021*follower/NN + 0.019*giveaway/NN'),
 (15,
  u'0.043*catch/VB_pokemon/NN + 0.026*weekend/NN + 0.024*feeling/NN + 0.022*pokemon/NN_feeling/NN + 0.021*tell/VB'),
 (16,
  u'0.043*rt/JJ + 0.043*ready/JJ + 0.042*ready/JJ_catch/VB + 0.042*oneplus/JJ + 0.042*stay/VB_pokemon/NN'),
 (17,
  u'0.081*new/JJ + 0.078*reason/NN + 0.077*parent/NN + 0.077*worry/VB + 0.076*new/JJ_reason/NN'),
 (18,
  u'0.097*know/VB_live/VB + 0.097*pokemon/NN_rat/NN + 0.097*live/VB_pokemon/NN + 0.097*rat/NN_bird/NN + 0.070*bird/NN'),
 (19,
  u'0.044*thing/NN + 0.035*play/NN + 0.014*work/VB + 0.013*talk/NN + 0.012*lead/VB'),
 (20,
  u'0.019*hour/NN + 0.019*year/NN + 0.018*past/JJ + 0.017*michelle/NN + 0.017*obama/NN'),
 (21,
  u'0.036*game/NN + 0.016*fun/NN + 0.012*lot/NN + 0.010*post/NN + 0.009*facebook/NN'),
 (22,
  u'0.117*dead/JJ + 0.024*ve/NN + 0.023*need/VB + 0.023*right/RB + 0.019*news/NN'),
 (23,
  u'0.138*time/NN_pokemon/NN + 0.137*catch/VB_legendary/JJ + 0.137*legendary/JJ + 0.134*pokemon/NN_catch/VB + 0.124*time/NN'),
 (24,
  u'0.052*team/NN + 0.035*app/NN + 0.029*stop/VB + 0.024*account/NN + 0.020*google/NN'),
 (25,
  u'0.122*server/NN + 0.063*pack/NN + 0.039*start/VB + 0.037*end/VB + 0.034*bout/NN'),
 (26,
  u'0.122*wild/JJ + 0.121*appear/VB + 0.118*wild/JJ_appear/VB + 0.028*capture/VB + 0.017*ft/NN'),
 (27,
  u'0.047*hit/VB_ground/NN + 0.047*tap/VB_hit/VB + 0.047*ground/NN_rt/NN + 0.047*fall/VB_pokeball/NN + 0.047*throw/NN'),
 (28,
  u'0.156*campus/NN + 0.038*today/NN + 0.018*week/NN + 0.014*incredible/JJ + 0.014*care/VB'),
 (29,
  u'0.066*come/VB + 0.046*interstate/NN + 0.023*android/NN + 0.018*iphone/NN + 0.012*download/NN'),
 (30,
  u'0.029*hear/VB + 0.017*complain/VB + 0.016*person/NN_complain/VB + 0.016*hear/VB_person/NN + 0.016*complain/VB_love/NN'),
 (31,
  u'0.151*pokemon/NN_trainer/NN + 0.026*good/JJ + 0.021*love/VB + 0.016*play/VB_pokemon/NN + 0.013*need/NN'),
 (32,
  u'0.162*catch/VB_pizzachu/NN + 0.162*pizzachu/NN + 0.062*catch/VB + 0.017*tomorrow/NN + 0.010*enter/VB_fol/JJ'),
 (33,
  u'0.022*wait/VB + 0.021*fucking/JJ + 0.015*datum/NN + 0.012*cool/JJ + 0.011*lure/NN'),
 (34,
  u'0.150*friday/NN + 0.048*kid/NN + 0.042*understand/VB + 0.037*kid/NN_understand/VB + 0.030*privacy/NN'),
 (35,
  u'0.152*trainer/NN + 0.015*little/JJ + 0.013*ll/VB + 0.012*neighborhood/NN + 0.011*eye/NN'),
 (36,
  u'0.032*text/NN + 0.030*vibrate/NN + 0.030*vibrate/NN_text/NN + 0.030*phone/NN_vibrate/NN + 0.029*phone/NN'),
 (37,
  u'0.026*true/JJ + 0.019*im/VB + 0.014*better/JJ + 0.014*download/VB + 0.013*obvious/JJ'),
 (38,
  u'0.083*law/NN_try/VB + 0.083*break/VB_law/NN + 0.082*catch/VB_safely/RB + 0.082*law/NN + 0.081*safely/RB'),
 (39,
  u'0.049*guy/NN + 0.013*music/NN + 0.012*smart/JJ + 0.010*hard/RB + 0.008*music/NN_news/NN'),
 (40,
  u'0.014*great/JJ + 0.014*gonna/VB + 0.013*talk/VB + 0.011*decision/NN + 0.007*australia/NN'),
 (41,
  u'0.118*cop/NN + 0.117*cop/NN_ain/RB + 0.117*ain/RB + 0.117*ain/RB_time/NN + 0.062*time/NN'),
 (42,
  u'0.030*level/NN + 0.019*park/NN + 0.018*canada/NN + 0.017*peta/NN + 0.016*honestly/RB'),
 (43,
  u'0.057*polouse/NN + 0.055*polouse/NN_tweeting/NN + 0.055*tweeting/NN + 0.030*spot/VB + 0.019*pokemon/NN_walk/VB'),
 (44,
  u'0.070*life/NN + 0.053*save/VB_life/NN + 0.049*save/VB + 0.049*rt/NN_save/VB + 0.046*rt/NN'),
 (45,
  u'0.025*tonight/NN + 0.019*police/NN + 0.015*okay/JJ + 0.014*yes/RB + 0.013*town/NN'),
 (46,
  u'0.159*ask/VB_gym/NN + 0.132*ask/VB + 0.089*play/VB + 0.071*night/NN + 0.071*play/VB_night/NN'),
 (47,
  u'0.037*welcome/VB + 0.036*pick/VB + 0.036*miss/VB_pokemon/NN + 0.036*pokeball/NN + 0.035*tap/VB_welcome/VB'),
 (48,
  u'0.145*rat/NN + 0.014*place/NN + 0.010*raise/VB + 0.010*augment/VB + 0.010*augment/VB_reality/NN'),
 (49,
  u'0.274*gym/NN + 0.034*wrong/JJ + 0.031*want/VB + 0.025*fuck/VB + 0.013*leader/NN')]

In [16]:
# visualize topic models

# prepare() extracts the topic-term and doc-topic data for pyLDAvis's
# interactive visualization; display() renders it inline in the notebook.
# NOTE(review): newer pyLDAvis releases renamed this module to
# pyLDAvis.gensim_models — confirm which version is installed.
pyLDAvis.display(pyLDAvis.gensim.prepare(lda, corpus, id2word))


Out[16]:

In [ ]: