In [1]:
#!pip install --ignore-installed --upgrade pandas

In [12]:
from sklearn.externals import joblib
import pandas as pd
import numpy as np
import gensim
from gensim import corpora, utils
from gensim.models.wrappers.dtmmodel import DtmModel
from collections import defaultdict, Counter
import sklearn.metrics
from scipy.spatial.distance import pdist, squareform
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models.ldamulticore import LdaMulticore
import nltk
import pyLDAvis
import pyLDAvis.gensim
#nltk.download()

In [3]:
# Read in wikipedia and imdb data set from .pickle file:
data = pd.read_pickle('clean_complete_ngram.pickle')

In [4]:
# Show the data:
data.head(5)


Out[4]:
title year synopsis reviews content reviews_tri contents_tri reviews_bi contents_bi
0 Dead Awake 2016 investig death twin sister sleep social worker... movi new havent seen name 3 4 last yeari famil... dead awak 2016 american supernatur psycholog h... {('good', 'horror', 'film'): 1, ('good', 'job'... {('refer', 'extern', 'link'): 1, ('extern', 'l... {('dream', 'movi'): 1, ('right', 'right'): 1, ... {('world', 'premier'): 1, ('psycholog', 'horro...
10 A Good American 2015 documentari reveal truth nsa cryptologist inno... documentari show situat idea know bit heard re... good american 2015 austrian documentari film c... {} {('new', 'york', 'time'): 1, ('refer', 'extern... {('make', 'sens'): 1, ('compel', 'film'): 1, (... {('produc', 'direct'): 1, ('score', 'film'): 1...
11 Hard Tide 2015 drug dealer who emul father success crimin car... watch rot last night tempt dont bother script ... hard tide 2015 british crime drama written dir... {('doesnt', 'take', 'long'): 1, ('nine', 'year... {('gave', 'film', 'posit'): 1, ('recept', 'rot... {('want', 'good'): 1, ('watch', 'film'): 1, ('... {('total', 'film'): 1, ('hard', 'time'): 1, ('...
13 Carrie Pilby 2016 social awkward 19yearold geniu make big plan s... excit see film toronto filmfest last week enjo... carri pilbi 2016 american comedi film direct s... {('excit', 'see', 'film'): 1, ('toronto', 'fil... {('acquir', 'distribut', 'right'): 1, ('right'... {('good', 'role'): 1, ('watch', 'film'): 1, ('... {('film', 'star'): 1, ('refer', 'extern'): 1, ...
14 A Dark Song 2016 griev death son woman hire occult expert lead ... writer felt job review mere whine prattl happe... dark song 2016 irish independ horror film writ... {('good', 'horror', 'film'): 1, ('act', 'prett... {('film', 'festiv', 'releas'): 1, ('film', 're... {('vast', 'major'): 1, ('fantast', 'film'): 1,... {('end', 'definit'): 1, ('festiv', 'releas'): ...

In [5]:
# Select title, year, imdb reviews, and wikipedia content columns:
movie_data = data[["title", "year", "reviews", "content"]]

# Melt to long format: the new 'variable' column records whether the text is the
# IMDb reviews or the Wikipedia content, so each is treated as its own document.
movie_data = movie_data.melt(id_vars=["title", "year"], value_vars=["reviews", "content"])

In [6]:
# Show the new data:
movie_data.head(5)


Out[6]:
title year variable value
0 Dead Awake 2016 reviews movi new havent seen name 3 4 last yeari famil...
1 A Good American 2015 reviews documentari show situat idea know bit heard re...
2 Hard Tide 2015 reviews watch rot last night tempt dont bother script ...
3 Carrie Pilby 2016 reviews excit see film toronto filmfest last week enjo...
4 A Dark Song 2016 reviews writer felt job review mere whine prattl happe...

In [7]:
# Collect the documents as a list of strings:
docs = movie_data['value'].astype(str).tolist()

In [8]:
# remove stopwords, numeric tokens, and single letters, then tokenize
stop_words = set(stopwords.words('english'))
numbers = {str(i) for i in range(0, 999)}
letters = set('a b c d e f g h i j k l m n o p q r s t u v w x y z'.split())
words_to_remove = stop_words.union(numbers).union(letters)
texts = [[word for word in document.lower().split() if word not in words_to_remove]
         for document in docs]

In [9]:
# remove words that appear fewer than 10 times across the corpus
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1        
texts = [[token for token in text if frequency[token] > 9] for text in texts]

In [ ]:
# Create a dictionary indexing the unique terms:
dictionary = corpora.Dictionary(texts)

# store the dictionary, for future reference
dictionary.save('wiki.dict')

In [ ]:
# Create a sparsely formatted corpus:
corpus = [dictionary.doc2bow(text) for text in texts]

# Store to disk, for later use:
corpora.MmCorpus.serialize('wiki.mm', corpus)

In [ ]:
# Specify a number of topics:
K = 100

In [ ]:
# Fit the LDA model (100 topics, 10 passes takes about 1/2 hour, 3 topics 3 passes takes 5 min):
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=K, id2word = dictionary, passes=20, alpha=.1/K, eta=.1/K)

# Save the model object for visualization:
ldamodel.save('wiki.model')

In [ ]:
# Collect:
#   phi_kv   - topic-term probabilities (one row per topic)
#   theta_nk - document-topic proportions (one row per document)
#   f_v      - corpus frequency of each term
#   p_v      - term probabilities
#   V_n      - number of unique terms in each document
#   W_n      - document lengths (total word counts)
#   f_k      - expected number of words assigned to each topic
#   p_k      - probability of each topic occurring
#   bayes_kv - probability of a topic given a word (Bayes' rule)
#   vocab    - list of terms in dictionary order
phi_kv = np.zeros((K, len(dictionary)))
theta_nk = np.zeros((len(corpus), K))
for k in range(K):
    # get_topic_terms returns (word_id, prob) pairs sorted by probability,
    # so assign by word id to keep columns aligned with the dictionary:
    for word_id, prob in ldamodel.get_topic_terms(k, topn=len(dictionary)):
        phi_kv[k, word_id] = prob
for n in range(len(corpus)):
    theta_nk[n, :] = np.array([topic_prob[1] for topic_prob in ldamodel.get_document_topics(corpus[n], minimum_probability=0)])
counts = Counter()
for text in texts:
    for word in text:
        counts[word] += 1
f_v = np.array([counts[dictionary[word_ind]] for word_ind in range(len(dictionary))])
p_v = f_v / sum(f_v)
V_n = np.array([len(doc) for doc in corpus])
W_n = np.array([sum(word[1] for word in doc) for doc in corpus])
f_k = np.array([sum(theta_nk[:, k] * W_n) for k in range(K)])
p_k = f_k / sum(f_k)
bayes_kv = np.zeros((K, len(dictionary)))
for k in range(K):
    bayes_kv[k, :] = phi_kv[k, :] * p_k[k] / p_v
vocab = [dictionary[i] for i in range(len(dictionary))]

In [ ]:
data = {'topic_term_dists': phi_kv, 
            'doc_topic_dists': theta_nk,
            'doc_lengths': W_n,
            'vocab': vocab,
            'term_frequency': f_v}

In [ ]:
movies_vis_data = pyLDAvis.prepare(**data)

In [ ]:
pyLDAvis.display(movies_vis_data)

In [ ]:
# Specify number of top words:
num_top_words = 10

# Show top words from each topic:
for k in range(K):
    print("topic " + str(k) + ":")
    topic_top_words = ldamodel.show_topic(k)
    for top_word in topic_top_words:
        print((top_word[0],format(top_word[1],".2f")))
    print("\n")

In [ ]:
# Obtain topic distribution for each movie review and every movie content:
topic_probs = []
for document in corpus:
    topic_probs.append(np.array([topic_probs_double[1] for topic_probs_double in ldamodel.get_document_topics(document, minimum_probability=0)]))
topic_probs_array = np.asarray(topic_probs)
topic_probs_array
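
In [ ]:
# calc_pairwise_sims_matrix is not defined in this notebook; the sketch below is
# an assumed implementation (hypothetical helper) supporting the "hellinger" and
# "cosine" options used in the next cell. It takes an (n_docs x K) matrix of
# document-topic proportions and returns an (n_docs x n_docs) similarity matrix.
def calc_pairwise_sims_matrix(doc_topic_matrix, metric):
    if metric == "hellinger":
        # Hellinger distance between topic distributions, negated so that
        # larger values mean more similar documents:
        hellinger = squareform(pdist(np.sqrt(doc_topic_matrix), metric='euclidean')) / np.sqrt(2)
        return -hellinger
    elif metric == "cosine":
        # Cosine similarity = 1 - cosine distance:
        return 1 - squareform(pdist(doc_topic_matrix, metric='cosine'))
    else:
        raise ValueError("unknown metric: " + metric)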

In [ ]:
# Compute pairwise negative hellinger distance matrix for documents:
#document_similarities = calc_pairwise_sims_matrix(topic_probs_array, "hellinger")

# Compute pairwise cosine similarity matrix for documents:
document_similarities = calc_pairwise_sims_matrix(topic_probs_array, "cosine")

In [ ]:
# Return the 10 documents most similar to a specified movie:
movie_title = "The Search for Santa Paws"
query_index = movie_data.loc[movie_data['title'] == movie_title].index[0]
query_similarities = document_similarities[query_index, :]
closest_movies = sorted(range(len(query_similarities)), key=lambda k: -query_similarities[k])[:10]
movie_data['title'].iloc[closest_movies]

In [ ]:
# Compute matrix of topic distribution similarities:
#topic_similarity_matrix = np.zeros((len(corpus), len(corpus)))
#for i in range(5):
#    for j in range(5):
#        topic_similarity_matrix[i,j] = gensim.matutils.cossim([topic_probs_double[1] for topic_probs_double in ldamodel.get_document_topics(corpus[i], minimum_probability=0)], [topic_probs_double[1] for topic_probs_double in ldamodel.get_document_topics(corpus[j], minimum_probability=0)])

In [ ]:
# Obtain topic distribution for each movie review and every movie content:
#topic_probs = []
#for document in corpus:
#    topic_probs.append(np.array([topic_probs_double[1] for topic_probs_double in lda_mult.get_document_topics(document, minimum_probability=0)]))
#topic_probs_array = np.asarray(topic_probs)
#topic_probs_array

In [ ]:
# TO DO: FIND TOP WORDS BY BAYES RULE:
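# A possible sketch (not in the original notebook): rank each topic's terms by
# the Bayes-rule scores in bayes_kv, i.e. the probability of topic k given word v:
for k in range(K):
    top_word_ids = np.argsort(-bayes_kv[k, :])[:num_top_words]
    print("topic " + str(k) + " (top words by Bayes rule):")
    print([dictionary[i] for i in top_word_ids])
    print("\n")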

In [ ]:
# Compute tf-idf weights for each document:
tf_idf = gensim.models.TfidfModel(corpus)

# Build a similarity index over the tf-idf corpus (named tfidf_sims so it does
# not overwrite the LDA-based document_similarities computed above):
tfidf_sims = gensim.similarities.Similarity('doc_sims', tf_idf[corpus],
                                            num_features=len(dictionary))

# Find the similarity of movies to a query movie:
query_doc = [w.lower() for w in texts[157]]
print(query_doc)
query_doc_bow = dictionary.doc2bow(query_doc)
print(query_doc_bow)
query_doc_tf_idf = tf_idf[query_doc_bow]
print(query_doc_tf_idf)
tfidf_sims[query_doc_tf_idf]

In [ ]:
# Fit the LDA model with multi core (3 topics 3 iterations 3 passes, 4 min):
#lda_mult = LdaMulticore(corpus, num_topics=K, id2word=dictionary, iterations = 3, passes = 3, alpha=.1/K, eta=.1/K)

In [ ]:
# Store the corpus in Blei's LDA-C format:
#corpora.BleiCorpus.serialize('wiki.lda-c', corpus)

# Read in the corpus in Blei's LDA-C format:
#corpus_ldac = corpora.BleiCorpus('wiki.lda-c')

In [ ]:
# format time slices:
#time_slices = {i: 0 for i in range(1939,2018)}
#for year in movie_data['year']:
#    time_slices[year] += 1
#time_slices = [val for val in time_slices.values()]
#time_slices

In [ ]:
# Run dynamic lda on the wikipedia and imdb corpus:
#model = DtmModel('dtm-win64.exe', corpus_ldac, time_slices, num_topics=3, id2word=dictionary)