In [1]:
#!pip install --ignore-installed --upgrade pandas

In [91]:
import pandas as pd
import numpy as np
import gensim
from sklearn.externals import joblib
from gensim import corpora, utils, similarities
from gensim.models.wrappers.dtmmodel import DtmModel
from collections import defaultdict, Counter
import sklearn.metrics
from scipy.spatial.distance import pdist, squareform
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models.ldamulticore import LdaMulticore
import nltk
import pyLDAvis
import pyLDAvis.gensim
#nltk.download()

In [29]:
# Load the pre-computed n-gram table from disk.
# NOTE: joblib.load unpickles the file, so only use it on trusted data.
data = joblib.load('data/ngrams')
# Renumber the rows 0..n-1 and discard the old index.
data = data.reset_index(drop=True)

In [30]:
# Remove n-grams that only occur once: in every n-gram count column, keep
# only the entries whose count is greater than 1.
NGRAM_COLUMNS = ['reviews_mono', 'contents_mono',
                 'reviews_bi', 'contents_bi',
                 'reviews_tri', 'contents_tri']


def drop_singletons(ngram_counts):
    """Return a new {n-gram: count} dict without the entries whose count is <= 1.

    Counts are passed through ``int`` before the comparison, exactly as the
    original per-column expressions did; the stored values are left unchanged.
    """
    return {ngram: count for ngram, count in ngram_counts.items() if int(count) > 1}


# One column-wise pass per column replaces six copy-pasted row-wise applies.
for column in NGRAM_COLUMNS:
    data[column] = data[column].apply(drop_singletons)

In [31]:
# join bi-grams and tri-grams with a space
def join_ngram_keys(ngram_counts):
    """Return a new {n-gram: count} dict whose keys are space-joined strings.

    Keys that are sequences of words are joined with a single space. Keys
    that are already strings are kept as they are, so re-running this cell
    does not split previously joined n-grams into single characters.
    """
    return {(ngram if isinstance(ngram, str) else ' '.join(ngram)): count
            for ngram, count in ngram_counts.items()}


for column in ['reviews_bi', 'contents_bi', 'reviews_tri', 'contents_tri']:
    data[column] = data[column].apply(join_ngram_keys)

In [32]:
# Merge the six n-gram count dicts of each row into a single Counter;
# an n-gram that appears in several columns gets the sum of its counts.
ngram_sources = ['reviews_mono', 'contents_mono',
                 'reviews_bi', 'contents_bi',
                 'reviews_tri', 'contents_tri']
data['ngrams'] = data.apply(
    lambda row: sum((Counter(row[source]) for source in ngram_sources), Counter()),
    axis=1,
)

In [33]:
# Preview the first rows to check the filtered, joined and combined n-gram columns.
data.head()


Out[33]:
title year reviews_mono contents_mono reviews_bi contents_bi reviews_tri contents_tri ngrams
0 Dead Awake 2016 {'actual': 4, 'sleep': 12, 'paralys': 3, 'prob... {'dead': 2, 'awak': 2, 'supernatur': 2, 'jocel... {'actual like': 2, 'good job': 2, 'good horror... {} {'nightmar elm street': 4} {} {'actual': 4, 'sleep': 12, 'paralys': 3, 'prob...
1 A Good American 2015 {'situat': 2, 'read': 2, 'documentari': 2, 'sn... {'documentari': 3, 'work': 2, 'whistleblow': 3... {} {'good american': 2, 'attack film': 2, 'said f... {} {} {'situat': 2, 'read': 2, 'documentari': 5, 'sn...
2 Hard Tide 2015 {'watch': 4, 'decid': 2, 'girl': 4, 'start': 2... {'hard': 3, 'tide': 3, '2015': 2, 'nathanael':... {'accident kill': 2, 'thing life': 2, 'old gir... {'drug deal': 2, 'girl play': 2, 'take care': ... {'year old girl': 2} {} {'watch': 4, 'decid': 2, 'girl': 4, 'start': 2...
3 Carrie Pilby 2016 {'geniu': 2, 'colleg': 2, 'adulthood': 3, 'the... {'carri': 5, 'pilbi': 6, 'susan': 3, 'johnson'... {'see film': 2, 'base book': 2, 'love movi': 2... {'princip photographi': 2, 'releas march': 2, ... {'live new york': 2, 'intern film festiv': 2, ... {'intern film festiv': 2, 'toronto intern film... {'geniu': 2, 'colleg': 2, 'adulthood': 3, 'the...
4 A Dark Song 2016 {'writer': 2, 'felt': 2, 'review': 7, 'mere': ... {'dark': 6, 'song': 6, '2016': 5, 'irish': 3, ... {'watch film': 5, 'fast forward': 2, 'thing st... {'rotten tomato': 2, 'film festiv': 2} {'best ive seen': 2, 'dont wast time': 2, 'two... {} {'writer': 2, 'felt': 2, 'review': 9, 'mere': ...

LDA


In [8]:
# Build the gensim vocabulary (n-gram <-> integer id) from the distinct
# n-grams of every document, then persist it to disk.
keys = [list(ngram_counts) for ngram_counts in data['ngrams']]
dictionary = corpora.Dictionary(keys)
dictionary.save('idtowords.dict')

In [9]:
def get_key(v):
    """Find the key (integer id) whose value in the global ``dictionary`` is ``v``.

    When ``dictionary`` exposes a ``token2id`` mapping (as a gensim
    ``corpora.Dictionary`` does), the id is read from it directly instead of
    scanning every entry, which makes a lookup constant-time rather than
    linear in the vocabulary size. Any other mapping falls back to the
    original scan over ``dictionary.items()``.

    Returns the matching key, or ``None`` when ``v`` is not present.
    """
    token2id = getattr(dictionary, 'token2id', None)
    if token2id is not None:
        return token2id.get(v)
    for key, value in dictionary.items():
        if value == v:
            return key
    return None


# vectorize function: element-wise get_key over a list/array of tokens
get_key_ = np.vectorize(get_key)

In [ ]:
# map n-grams with keys: turn every document's {n-gram: count} mapping into
# gensim bag-of-words form, a list of (token_id, count) pairs.
# The ids are read straight from the dictionary's token -> id map, which avoids
# a scan of the whole vocabulary per token and also works for a document that
# has no n-grams left (np.vectorize raises on empty input).
token2id = dictionary.token2id
data['tokens'] = data['ngrams'].apply(
    lambda ngram_counts: [(token2id[ngram], count) for ngram, count in ngram_counts.items()]
)
## Create a sparsely formatted corpus:
corpus = list(data['tokens'])
# store to disk
corpora.MmCorpus.serialize('corpus.mm', corpus)

In [ ]:
# Specify a number of topics:
K = 50
# Fit the LDA model with K topics and 20 passes; alpha and eta are both 0.1/K.
# Timing notes from earlier runs: 100 topics, 10 passes takes about 1/2 hour;
# 3 topics, 3 passes takes 5 min.
# random_state is fixed so that re-running the notebook reproduces the same topics.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=K,
    id2word=dictionary,
    passes=20,
    alpha=.1/K,
    eta=.1/K,
    random_state=0,
)

# Save the model object for visualization (the file name is reused by the cell that reloads it):
ldamodel.save('wiki.model')

In [16]:
# Reload the serialized corpus and the saved LDA model from disk.
corpus = corpora.MmCorpus('corpus.mm')

lda_mult = gensim.models.ldamodel.LdaModel.load('wiki.model')
# Later cells refer to the model as `ldamodel` and to the topic count as `K`.
# Both are otherwise only defined by the training cell, so bind them here too;
# the notebook then also runs when the model is loaded instead of re-fitted.
ldamodel = lda_mult
K = lda_mult.num_topics

In [25]:
# Print the corpus summary (document, feature and non-zero entry counts) as a sanity check.
print(corpus)


MmCorpus(1549 documents, 86042 features, 1973414 non-zero entries)

In [17]:
# Collect phi matrix of topic word proportions, theta matrix of document topic proportions, f_v of word frequencies
# p_v of term probabilities, V_n of unique terms in each document, W_n of document lengths, f_k of number of words
# used in each topic, and p_k probabilities of each topic occurring, and bayes probabilities of a topic given a word,
# and a vocab list of terms.
# This cell allocates phi_kv and theta_nk and fills phi_kv; the other quantities are computed in later cells.
phi_kv = np.zeros((K, len(dictionary)))
theta_nk = np.zeros((len(corpus), K))
for k in range(0, K):
    # get_topic_terms returns (word_id, probability) pairs ordered by decreasing
    # probability, not by word id. Each probability must therefore be written to
    # the column of its own word id; copying the list positionally would detach
    # the columns of phi_kv from the dictionary ids used for vocab and f_v.
    for word_id, word_prob in lda_mult.get_topic_terms(k, len(dictionary)):
        phi_kv[k, word_id] = word_prob

In [18]:
# Fill theta_nk: row n holds the topic proportions of document n.
# NOTE(review): `ls` is not read anywhere in the visible notebook; it is kept
# only in case a cell outside this view uses it.
ls = np.zeros([len(corpus), 1])
# Use the same reloaded model (lda_mult) that produced phi_kv, and stream the
# corpus once instead of indexing it document by document.
for n, document in enumerate(corpus):
    # minimum_probability=0 asks for every topic rather than only the dominant ones.
    for topic_id, topic_prob in lda_mult.get_document_topics(document, minimum_probability=0):
        theta_nk[n, topic_id] = topic_prob

In [19]:
# Corpus-wide n-gram frequencies. Counter.update accumulates in place; the
# previous `Counter(...) + counts` rebuilt the whole running total per document.
counts = Counter()
for ngram_counts in data['ngrams']:
    counts.update(ngram_counts)

# Terms ordered by dictionary id, so position v in every vector below is term id v.
vocab = [dictionary[i] for i in range(len(dictionary))]
# f_v: total frequency of each term; p_v: its share of all term occurrences.
f_v = np.array([counts[term] for term in vocab])
p_v = f_v / f_v.sum()
# V_n: number of distinct terms per document; W_n: total term count per document.
V_n = np.array([len(doc) for doc in corpus])
W_n = np.array([sum(count for _, count in doc) for doc in corpus])
# f_k: expected number of words assigned to each topic; p_k: the topic's share of them.
f_k = [sum(theta_nk[:, k] * W_n) for k in range(K)]
p_k = np.asarray(f_k) / sum(f_k)
# bayes_kv[k, v] = phi_kv[k, v] * p_k[k] / p_v[v], computed for all topics at once by broadcasting.
bayes_kv = phi_kv * p_k[:, np.newaxis] / p_v

In [27]:
# Keyword arguments for pyLDAvis.prepare, assembled from the matrices and
# vectors computed in the preceding cells.
data2 = dict(
    topic_term_dists=phi_kv,
    doc_topic_dists=theta_nk,
    doc_lengths=W_n,
    vocab=vocab,
    term_frequency=f_v,
)

In [28]:
# Build the pyLDAvis visualisation data from the keyword arguments collected in data2.
movies_vis_data = pyLDAvis.prepare(**data2)


C:\Users\yanxi\Anaconda3\lib\site-packages\pyLDAvis\_prepare.py:387: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]

In [22]:
# Render the prepared topic visualisation in the notebook output.
pyLDAvis.display(movies_vis_data)


Out[22]:

In [143]:
# Show top words from each topic:
# num_topics=-1 requests every topic, num_words=10 limits each topic to its ten
# highest-weighted terms, and formatted=False yields (word, probability) pairs
# rather than formatted strings.
ldamodel.show_topics(num_topics=-1, num_words=10, log=False, formatted=False)


Out[143]:
[(0,
  [('v', 0.007955280346320124),
   ('v vendetta', 0.004144908760715557),
   ('govern', 0.0039497779514525416),
   ('polit', 0.0030598583835260576),
   ('natali portman', 0.00274356719684559),
   ('movi', 0.0026963305290861406),
   ('mask', 0.0026219933346697754),
   ('like', 0.002600213288089462),
   ('film', 0.002598547801420312),
   ('evey', 0.00233786014175698)]),
 (1,
  [('christin', 0.0054148832441379855),
   ('car', 0.005099247638184713),
   ('bee', 0.004374876310619607),
   ('hollywood', 0.003769244952787116),
   ('norma', 0.003685903875891477),
   ('film', 0.0035396839571629514),
   ('play', 0.0034532419719825224),
   ('like', 0.003006817590549787),
   ('joe', 0.0027863310378728715),
   ('charact', 0.002626056105831778)]),
 (2,
  [('titan', 0.005680310211267982),
   ('film', 0.0028832828125581397),
   ('ship', 0.002769335523053863),
   ('movi', 0.0026100650367330194),
   ('like', 0.002396302580978527),
   ('rose', 0.0022903707071592443),
   ('stori', 0.002272656109318733),
   ('love', 0.0022155606981555795),
   ('end', 0.0021130612735211726),
   ('time', 0.002038801692710049)]),
 (3,
  [('ali', 0.013036462422555723),
   ('fight', 0.010778354028623415),
   ('game', 0.009857577354389545),
   ('prison', 0.009009774503708014),
   ('smith', 0.008214679826916158),
   ('box', 0.006012488129598709),
   ('sport', 0.004960804341419005),
   ('bronson', 0.003758757002165717),
   ('boxer', 0.0035655344246424497),
   ('film', 0.003404285206464824)]),
 (4,
  [('anim', 0.006708159880499993),
   ('ant', 0.005824585155601335),
   ('sin citi', 0.005491102243266934),
   ('whale', 0.005087952326282179),
   ('trainer', 0.004287197826258742),
   ('antz', 0.003984624271223198),
   ('seaworld', 0.003838041850222509),
   ('sequel', 0.0038206792556332403),
   ('dinosaur', 0.0038054351370287527),
   ('orca', 0.0037350599908533085)]),
 (5,
  [('vampir', 0.005778388104614601),
   ('book', 0.005329903168435582),
   ('forrest gump', 0.004790927330114669),
   ('bella', 0.004763335842653248),
   ('forrest', 0.004280145856439312),
   ('twilight', 0.004185467350262833),
   ('read book', 0.0038415819914584762),
   ('edward', 0.00356626853885539),
   ('love', 0.003035315387310186),
   ('movi', 0.0029931850298828616)]),
 (6,
  [('hellboy', 0.008332016438866963),
   ('prestig', 0.006166454487149376),
   ('nolan', 0.005114944801944865),
   ('magician', 0.004819185061937136),
   ('trick', 0.0037530197337008997),
   ('borden', 0.003713367758406082),
   ('angier', 0.0036655493748938037),
   ('magic', 0.0029918735001182827),
   ('hugh jackman', 0.0028553427250032682),
   ('movi', 0.0027512524431178302)]),
 (7,
  [('famili', 0.004871232343783099),
   ('gay', 0.004710296366574546),
   ('film', 0.004472581437784102),
   ('play', 0.004418507399518716),
   ('mother', 0.004142923109277789),
   ('saroo', 0.0040370042372237964),
   ('like', 0.0028763516893109604),
   ('perform', 0.0028724326832724437),
   ('life', 0.002869270078112818),
   ('stori', 0.0025942942890823796)]),
 (8,
  [('film', 0.003182947744707301),
   ('movi', 0.0029743817824240382),
   ('stori', 0.00269423854026585),
   ('like', 0.0026469866668593785),
   ('babel', 0.002583359815066404),
   ('charact', 0.002506008120576769),
   ('play', 0.0023537500621321865),
   ('end', 0.002187599558623848),
   ('see', 0.0020431436611131476),
   ('perform', 0.0019251712214531924)]),
 (9,
  [('dori', 0.007238404276937817),
   ('bulli', 0.004196952976143501),
   ('pixar', 0.003764623983628511),
   ('find nemo', 0.0035518573273955532),
   ('like', 0.0035168488034364315),
   ('find dori', 0.003512769801145484),
   ('parent', 0.0033954261840042506),
   ('movi', 0.003268538284001804),
   ('roy', 0.002877063506112276),
   ('film', 0.0028492705367679757)]),
 (10,
  [('kubrick', 0.006804821633016505),
   ('shine', 0.005336996671421136),
   ('jack', 0.004550829150885462),
   ('zodiac', 0.004283681344668684),
   ('hotel', 0.003584846264384484),
   ('milk', 0.003243012544505778),
   ('film', 0.0029444359917187145),
   ('stanley kubrick', 0.0028447927463397062),
   ('jack nicholson', 0.0024777512250424645),
   ('gremlin', 0.002424038098060682)]),
 (11,
  [('jacki brown', 0.009789408975939795),
   ('tarantino', 0.009477756934326974),
   ('jacki', 0.007438363426038424),
   ('lincoln', 0.006265647757600007),
   ('pulp fiction', 0.006209332659891951),
   ('l jackson', 0.005078665368413296),
   ('samuel l', 0.004821832245176558),
   ('samuel l jackson', 0.004711098012978784),
   ('max', 0.0045099438618329555),
   ('de niro', 0.004483996555210253)]),
 (12,
  [('captain america', 0.005695372700869737),
   ('civil war', 0.005308978947759966),
   ('aveng', 0.004945143903658069),
   ('kung fu', 0.004028334578408415),
   ('marvel', 0.003994589414333738),
   ('po', 0.003675142780359885),
   ('sleepi hollow', 0.003506223421143844),
   ('fight', 0.003363776659252027),
   ('comic book', 0.0030463768317284205),
   ('superhero', 0.002958227498215572)]),
 (13,
  [('catwoman', 0.004330710837658594),
   ('high school', 0.0037624663106380998),
   ('like', 0.0033428221137879926),
   ('movi', 0.00316485525293407),
   ('mean girl', 0.002932760614507737),
   ('school', 0.0026947885494286302),
   ('heather', 0.002690650642730464),
   ('girl', 0.0026721680353849527),
   ('film', 0.0024959957802726035),
   ('mcdonald', 0.002341104392990332)]),
 (14,
  [('bridget', 0.004385552442432154),
   ('film', 0.0033410275986778455),
   ('like', 0.003043151646461144),
   ('play', 0.0030197648117905895),
   ('movi', 0.0029609086488485746),
   ('hors', 0.002868111964265957),
   ('see', 0.002646221308311686),
   ('love', 0.002535663081906408),
   ('woman', 0.0025130254509748303),
   ('charact', 0.002475704920240168)]),
 (15,
  [('war', 0.015269376541439816),
   ('soldier', 0.008100794828382314),
   ('patton', 0.006978246899944134),
   ('militari', 0.006223439300399159),
   ('american', 0.00471969209504441),
   ('gener', 0.00467143056020317),
   ('film', 0.004659484058133399),
   ('german', 0.0045144060503484506),
   ('battl', 0.003714401343054082),
   ('armi', 0.0037143526184779878)]),
 (16,
  [('laura', 0.005935045411490199),
   ('hitchcock', 0.0052787997250031485),
   ('coralin', 0.00429735812518251),
   ('rear window', 0.004117379171829716),
   ('jeff', 0.004000137791538871),
   ('ip man', 0.003875475327575932),
   ('apart', 0.0037956646242023532),
   ('murder', 0.003329953478896294),
   ('film', 0.0031164037511053275),
   ('martial art', 0.00310329864376009)]),
 (17,
  [('ture', 0.007635823062961964),
   ('philomena', 0.004420324493581919),
   ('alan ture', 0.003958960293016288),
   ('work', 0.003518013090416162),
   ('film', 0.003309523099873955),
   ('stori', 0.003267268451338589),
   ('life', 0.003264334581473245),
   ('imit game', 0.0032015354083552946),
   ('machin', 0.003056031339748636),
   ('homosexu', 0.002948265933330862)]),
 (18,
  [('hot fuzz', 0.005180747162951317),
   ('shaun dead', 0.004565878777166895),
   ('comedi', 0.0039050082948631644),
   ('simon pegg', 0.0034962789151002944),
   ('film', 0.0034905007531474417),
   ('dog', 0.0034741689356748203),
   ('funni', 0.0032953482295817547),
   ('angel', 0.003173548821897456),
   ('zach', 0.003150192042237165),
   ('cop', 0.0029809484244504186)]),
 (19,
  [('documentari', 0.02396639947689443),
   ('food', 0.007662657256561189),
   ('interview', 0.0058159849224356155),
   ('penguin', 0.005708121762974308),
   ('peopl', 0.005014368980137736),
   ('inform', 0.004726211601555412),
   ('eat', 0.004551377380680828),
   ('health', 0.0037815306027832816),
   ('industri', 0.0034897958686740532),
   ('world', 0.003381289264751302)]),
 (20,
  [('arthur', 0.008604937178132313),
   ('king arthur', 0.005324347593176784),
   ('roman', 0.00503144648096148),
   ('zootopia', 0.004727647620254457),
   ('knight', 0.0036515632345116905),
   ('werewolf', 0.003103208868959179),
   ('saxon', 0.003092936855948923),
   ('david', 0.003045289920942378),
   ('battl', 0.0029894259220863096),
   ('like', 0.002856676621182911)]),
 (21,
  [('tarantino', 0.010326982378026511),
   ('pulp fiction', 0.008448483681783385),
   ('nazi', 0.004034266433791184),
   ('quentin tarantino', 0.0036879631137541114),
   ('scene', 0.0030752592393376126),
   ('movi', 0.003003454824814636),
   ('dialogu', 0.0029345415230789566),
   ('john travolta', 0.002927506544799834),
   ('film', 0.002811308451648073),
   ('violenc', 0.0026107744067933535)]),
 (22,
  [('queen', 0.006861231616148617),
   ('mason', 0.004799551174209972),
   ('boyhood', 0.004142493085849369),
   ('life', 0.004007295321862994),
   ('famili', 0.0031967201657772453),
   ('linklat', 0.0031296218804889447),
   ('film', 0.00303526359522179),
   ('grow', 0.0026143514582836636),
   ('peopl', 0.0025123874678472822),
   ('like', 0.0025013232099957497)]),
 (23,
  [('hoover', 0.008700038652990256),
   ('corps bride', 0.00732887106352155),
   ('tim burton', 0.006826665264442654),
   ('victor', 0.005929040510379885),
   ('burton', 0.004976142260056772),
   ('j edgar', 0.004797945838306228),
   ('emili', 0.003796442333339004),
   ('nightmar christma', 0.0034958845329940993),
   ('keat', 0.003236875080681154),
   ('fbi', 0.003196416610413886)]),
 (24,
  [('like', 0.0034253115255457062),
   ('film', 0.003258474572781298),
   ('movi', 0.0032153996507010394),
   ('show', 0.0030328871964571424),
   ('disney', 0.0026956007217451353),
   ('charact', 0.002588108475721663),
   ('anim', 0.0025164701147089943),
   ('play', 0.002293765927055132),
   ('seri', 0.0022646963526813573),
   ('see', 0.0022642423353006675)]),
 (25,
  [('london', 0.0035197223181087445),
   ('film', 0.003394970871911595),
   ('like', 0.003292955875178177),
   ('presid', 0.0032162862614131256),
   ('nicol kidman', 0.00304980595800751),
   ('movi', 0.0030454873293242844),
   ('terrorist', 0.002966627501666367),
   ('frankenstein', 0.002835998262017626),
   ('mel brook', 0.0027838546926007217),
   ('invas', 0.0024809129382805466)]),
 (26,
  [('doctor strang', 0.004857682971605819),
   ('strang', 0.003743141586560968),
   ('marvel', 0.0034717701716745777),
   ('movi', 0.0034178403275768824),
   ('like', 0.003092415889813547),
   ('dog', 0.0028017806457595103),
   ('cat', 0.0024044116143817225),
   ('pet', 0.0023252864180071005),
   ('film', 0.002277455245164544),
   ('world', 0.0022750704567086944)]),
 (27,
  [('horror', 0.00473361082947753),
   ('horror movi', 0.00355075371756822),
   ('horror film', 0.0031433665446544898),
   ('son', 0.003106894814739307),
   ('film', 0.0030929245349092423),
   ('like', 0.0028938406637667425),
   ('movi', 0.0028220399474792627),
   ('end', 0.0027751412093911403),
   ('see', 0.002323792689112002),
   ('hous', 0.002248691168513446)]),
 (28,
  [('hugo', 0.011514033003822113),
   ('mulan', 0.00518332869455067),
   ('alic', 0.004949131157387205),
   ('3d', 0.004689314351984262),
   ('scorses', 0.00443361107623598),
   ('film', 0.0034868083206997433),
   ('toy', 0.0032871850390220705),
   ('father', 0.0029506018543388186),
   ('time', 0.002926166463236693),
   ('magic', 0.002899179242699674)]),
 (29,
  [('action', 0.0041294565001336846),
   ('cage', 0.003813169878852763),
   ('play', 0.003679977958055829),
   ('like', 0.0036151406707821284),
   ('movi', 0.0035545238426177286),
   ('film', 0.003440253509303975),
   ('thriller', 0.003122969470771121),
   ('end', 0.002774607297352119),
   ('kill', 0.0026843458101553884),
   ('agent', 0.0026599624374992322)]),
 (30,
  [('shark', 0.009838000402231208),
   ('jaw', 0.006217394591680052),
   ('gatsbi', 0.0052084803966805015),
   ('water', 0.0026654345058311243),
   ('film', 0.0026547111630341384),
   ('spielberg', 0.002521284065743088),
   ('movi', 0.002516501207758189),
   ('like', 0.0021974859831383176),
   ('novel', 0.0021628753960389047),
   ('scene', 0.0021514626840319104)]),
 (31,
  [('sam', 0.02644529285997377),
   ('sean penn', 0.008659321502188148),
   ('father', 0.007634014158227186),
   ('daughter', 0.006740195388434934),
   ('luci', 0.006504072500943755),
   ('child', 0.005962361611248664),
   ('mother', 0.005494215708550707),
   ('penn', 0.00508912738673334),
   ('ernest', 0.005056184764343401),
   ('movi', 0.004442380371351324)]),
 (32,
  [('shark', 0.015424157014806573),
   ('christma', 0.008929597835398522),
   ('jack', 0.0052528923785752505),
   ('herzog', 0.004774732015480693),
   ('jaw', 0.0035328306908396314),
   ('brodi', 0.0034779317309458204),
   ('sequel', 0.003419609177282074),
   ('film', 0.003324761408988194),
   ('water', 0.0032575110143473474),
   ('cave', 0.0031571745532541816)]),
 (33,
  [('anim', 0.008112742314263696),
   ('et', 0.005249838628722552),
   ('disney', 0.004054089271707861),
   ('movi', 0.0034665320792685236),
   ('moana', 0.0031984284317448856),
   ('film', 0.003077306543547521),
   ('like', 0.0030623852719044286),
   ('giant', 0.0028727534542107294),
   ('stori', 0.0026565022326935145),
   ('anim film', 0.002632372347545754)]),
 (34,
  [('music', 0.012239958109355005),
   ('song', 0.009343166930799125),
   ('band', 0.007986316498917784),
   ('sing', 0.00417704024345578),
   ('beatl', 0.0035992460041742187),
   ('like', 0.0035631132788295854),
   ('film', 0.0033306529935544617),
   ('play', 0.003316648064495809),
   ('perform', 0.003236142664462364),
   ('love', 0.0030841927115129726)]),
 (35,
  [('movi', 0.004540633312687469),
   ('like', 0.003581002990737691),
   ('film', 0.0032950921840346273),
   ('play', 0.0028354916781714564),
   ('get', 0.002606895526173135),
   ('charact', 0.002602460673611456),
   ('see', 0.0025363161376517136),
   ('go', 0.0022892698163229246),
   ('scene', 0.002238304030307095),
   ('make', 0.0021940853115992617)]),
 (36,
  [('cartoon', 0.007067907801992622),
   ('roger rabbit', 0.0061473512792339854),
   ('roger', 0.004762159126182217),
   ('paint', 0.004719403039260968),
   ('toon', 0.004621426666226609),
   ('margaret', 0.004510732282304087),
   ('film', 0.004323222581659166),
   ('looney tune', 0.003635096604061792),
   ('art', 0.0034797530353102993),
   ('walter', 0.0034436150457515568)]),
 (37,
  [('anim', 0.011759559542983333),
   ('voic', 0.007206811066817939),
   ('mowgli', 0.005677337626439766),
   ('jungl book', 0.004376937423211289),
   ('star war', 0.004372260827233785),
   ('jungl', 0.004280999687766443),
   ('disney', 0.003922763474366437),
   ('bear', 0.0036740132424885725),
   ('movi', 0.0032900235520207763),
   ('children', 0.003094042795135187)]),
 (38,
  [('comedi', 0.016820833352259902),
   ('funni', 0.016220325603534605),
   ('joke', 0.00969433084445246),
   ('laugh', 0.009649266537346733),
   ('kid', 0.008028833966097677),
   ('humor', 0.006915585482548239),
   ('hilari', 0.006643424840259415),
   ('play', 0.0060413069963584495),
   ('movi', 0.005123885823364468),
   ('like', 0.004409638265855183)]),
 (39,
  [('zombi', 0.008371221164401095),
   ('vampir', 0.005956811565545894),
   ('segment', 0.005792004117479918),
   ('short', 0.0056147281088430945),
   ('like', 0.0041889622612350675),
   ('film', 0.004018342213894351),
   ('horror', 0.0038876481607798986),
   ('stori', 0.0038715729415269287),
   ('found footag', 0.0038118444375286747),
   ('ethan', 0.0034235198247124627)]),
 (40,
  [('minion', 0.005532135459560943),
   ('armageddon', 0.004352296127803583),
   ('movi', 0.004049047125443678),
   ('space', 0.00350736565757296),
   ('parti', 0.003265519491925058),
   ('like', 0.0031372881396036895),
   ('film', 0.0029334941423502002),
   ('asteroid', 0.0027914271153127164),
   ('bruce willi', 0.002652265228489481),
   ('make', 0.002210589438027659)]),
 (41,
  [('mike', 0.00859291323052132),
   ('poker', 0.007272409488467307),
   ('damon', 0.005408041276940969),
   ('matt damon', 0.005297409329594395),
   ('play', 0.00494679798372416),
   ('gambl', 0.004899879752016969),
   ('rounder', 0.004228138097123384),
   ('worm', 0.0041886020371158655),
   ('game', 0.003958199592733736),
   ('film', 0.0036069042276901195)]),
 (42,
  [('alien', 0.0061365824920051335),
   ('stephen king', 0.004645067033260583),
   ('dreamcatch', 0.004123081444101065),
   ('like', 0.0033756473472976627),
   ('scari movi', 0.0033660072012947273),
   ('adam sandler', 0.003356345546405259),
   ('movi', 0.0032977964135144163),
   ('sandler', 0.003044791177742643),
   ('book', 0.002714426476262007),
   ('film', 0.0025821235931530184)]),
 (43,
  [('love', 0.003365859776073881),
   ('film', 0.0033079941886056934),
   ('punish', 0.002897059927612455),
   ('music', 0.0028179654667600373),
   ('like', 0.0027857754799058866),
   ('movi', 0.002615549069909907),
   ('charact', 0.0024540438659849356),
   ('chicago', 0.0024132713626846816),
   ('play', 0.0023474017718287956),
   ('stori', 0.0022395620174848734)]),
 (44,
  [('bill murray', 0.003776567887889286),
   ('film', 0.0036779287186424533),
   ('charact', 0.0035765886602390972),
   ('love', 0.0035482599429864804),
   ('anderson', 0.0035063866222770856),
   ('like', 0.0033845737649334452),
   ('play', 0.003383495565254655),
   ('relationship', 0.003361367096340624),
   ('we anderson', 0.003293273564795644),
   ('movi', 0.0030615784775169406)]),
 (45,
  [('film', 0.005190267666006418),
   ('like', 0.00438199412081695),
   ('murder', 0.0037738603468463862),
   ('end', 0.0036005556922865285),
   ('kill', 0.003481591942983189),
   ('get', 0.00332876171660867),
   ('movi', 0.003308972327989596),
   ('stori', 0.0032507516750482995),
   ('case', 0.0030220934533255303),
   ('david', 0.0029019645934832664)]),
 (46,
  [('frank', 0.006491357280565783),
   ('hellrais', 0.005020734646198042),
   ('chucki', 0.0046174714076154255),
   ('cenobit', 0.00454177897753315),
   ('hell', 0.003862510298871546),
   ('horror', 0.003757973610947276),
   ('film', 0.0034922914188203207),
   ('julia', 0.003277301670187491),
   ('pinhead', 0.003147337342380718),
   ('kirsti', 0.002993732568814136)]),
 (47,
  [('sixth sens', 0.005795117022007233),
   ('bruce willi', 0.0046240751315090615),
   ('cole', 0.0033716990859942236),
   ('spotlight', 0.003320817500466491),
   ('haley joel', 0.002870377418492879),
   ('joel osment', 0.002840522406912891),
   ('film', 0.002703139088254087),
   ('movi', 0.0026517978682914977),
   ('end', 0.002579220858858355),
   ('haley joel osment', 0.002576655929948954)]),
 (48,
  [('high school', 0.0051771252555675145),
   ('song', 0.004182071382141232),
   ('movi', 0.003523390459709779),
   ('music', 0.0033934960251901566),
   ('like', 0.003242199863082102),
   ('school music', 0.0030020791260505597),
   ('sing', 0.0027715097436366207),
   ('high school music', 0.002743446957833265),
   ('film', 0.002545499789388427),
   ('greas', 0.0024775606169947745)]),
 (49,
  [('jew', 0.007787153651280862),
   ('schindler', 0.006527078197464334),
   ('schindler list', 0.005892508038749344),
   ('spielberg', 0.005709840287391958),
   ('holocaust', 0.005560139415929855),
   ('nazi', 0.004325405363434979),
   ('jewish', 0.0037718076365723017),
   ('film', 0.003590166085599054),
   ('black white', 0.0031822757199269623),
   ('steven spielberg', 0.0026950451950057542)])]

In [202]:
# Specify number of top words:
#num_top_words = 10

# Show top words from each topic:
#for k in range(K):
#    print("topic " + str(k) + ":")
#    topic_top_words = ldamodel.show_topic(k)
#    for top_word in topic_top_words:
#        print((top_word[0],format(top_word[1],".4f")))
#    print("\n")

In [218]:
# Obtain topic distribution for each movie review and every movie content:
#topic_probs = []
#for document in corpus:
#    topic_probs.append(np.array([topic_probs_double[1] for topic_probs_double in ldamodel.get_document_topics(document, minimum_probability=0)]))
#topic_probs_array = np.asarray(topic_probs)

In [23]:
# Per-document topic distribution: one {topic_id: probability} dict per corpus
# document. Every topic id 0..K-1 starts at 0 and is then overwritten with the
# probability reported by the model.
movie_topics = []
for document in corpus:
    topic_probs = dict.fromkeys(range(K), 0)
    topic_probs.update(ldamodel.get_document_topics(document, minimum_probability=0))
    movie_topics.append(topic_probs)

Application


In [252]:
# Look up the rows whose title contains 'Titanic'; the row label shown in the
# output is what the query cell uses as the document index.
data.loc[data['title'].str.contains('Titanic')]


Out[252]:
title year reviews_mono contents_mono reviews_bi contents_bi reviews_tri contents_tri ngrams
157 Titanic 1997 {'convers': 38, 'turn': 145, 'ill': 83, 'menti... {'titan': 95, '1997': 13, 'american': 3, 'epic... {'die hard': 5, 'first world': 2, 'russel crow... {'second film': 4, 'film depict': 6, 'show fil... {'lot peopl say': 3, 'didnt know expect': 2, '... {'special effect film': 2, 'box offic mojo': 3... {'convers': 45, 'turn': 148, 'ill': 83, 'menti...

In [253]:
# convert the query to model space
# `query` is the row position of the movie to search with. The commented
# values are alternative queries that were tried earlier.
# Moana: animation, adventure
#query = 168
#query = 243
# 157 is the row of 'Titanic' (1997) found by the title lookup.
query = 157
# Bag-of-words vector of the query document, as stored in the corpus.
vec_bow = corpus[query]
# Topic-space representation of the query according to the LDA model.
vec_lda = ldamodel[vec_bow]

In [237]:
# build the index
# Every corpus document is projected into topic space and stored in a dense
# similarity index. num_features is set explicitly to the number of topics:
# the transformed vectors are sparse, so letting the index infer its width from
# the largest topic id it happens to see could make it narrower than the model.
index = similarities.MatrixSimilarity(ldamodel[corpus], num_features=ldamodel.num_topics)
# Persist the index, then load it back from disk.
index.save('similarity.index')
index = similarities.MatrixSimilarity.load('similarity.index')

In [254]:
# Score the query against every indexed document...
sims = index[vec_lda]
# ...then rank the (document position, similarity) pairs from most to least similar.
sims = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)

In [257]:
# Show the ten best matches as (document position, similarity) pairs.
sims[0:10]


Out[257]:
[(157, 1.0),
 (720, 1.0),
 (1448, 0.99970907),
 (1449, 0.99965554),
 (326, 0.99958664),
 (81, 0.99954736),
 (327, 0.9995186),
 (1212, 0.9043089),
 (1547, 0.90419513),
 (1083, 0.86937153)]

In [258]:
# Titles of the twenty most similar documents, best match first.
[data.loc[doc_position, 'title'] for doc_position, _similarity in sims[0:20]]


Out[258]:
['Titanic',
 'No Country for Old Men',
 'The Double',
 'The Double',
 'The Double',
 'Cloud Atlas',
 'The Double',
 'Seeking a Friend for the End of the World',
 'Chevolution',
 'Valhalla Rising',
 'Outrage',
 'Kurt & Courtney',
 'Aziz Ansari: Buried Alive',
 "Women He's Undressed",
 'We Need to Talk About Kevin',
 'Soaked in Bleach',
 'Amu',
 'Waking Life',
 'Radio Free Albemuth',
 'Whitey: United States of America v. James J. Bulger']