In [1]:
#!pip install --ignore-installed --upgrade pandas
In [12]:
from sklearn.externals import joblib
import pandas as pd
import numpy as np
import gensim
from gensim import corpora, utils
from gensim.models.wrappers.dtmmodel import DtmModel
from collections import defaultdict, Counter
import sklearn.metrics
from scipy.spatial.distance import pdist, squareform
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models.ldamulticore import LdaMulticore
import nltk
import pyLDAvis
import pyLDAvis.gensim
#nltk.download()
In [3]:
# Read in wikipedia and imdb data set from .pickle file:
data = pd.read_pickle('clean_complete_ngram.pickle')
In [4]:
# Show the data:
data.head(5)
Out[4]:
In [5]:
# Select title, year, imdb reviews, and wikipedia content columns:
movie_data = data[["title", "year", "reviews", "content"]]
# Create a new 'variable' column recording whether each document came from the reviews or the
# Wikipedia content column, so each reviews entry and each content entry is its own document:
movie_data = movie_data.melt(id_vars=["title", "year"], value_vars=["reviews", "content"])
In [6]:
# Show the new data:
movie_data.head(5)
Out[6]:
In [7]:
# Collect the documents as a list of strings:
docs = movie_data['value'].astype(str).tolist()
In [8]:
# Remove common stop words, standalone numbers, and single letters, then tokenize:
stop_words = set(stopwords.words('english'))
# Build the number tokens as a set of strings so that set.union removes whole tokens
# (a plain string here would only contribute individual characters):
numbers = set(str(i) for i in range(0, 999))
letters = set('a b c d e f g h i j k l m n o p q r s t u v w x y z'.split())
words_to_remove = stop_words.union(numbers).union(letters)
texts = [[word for word in document.lower().split() if word not in words_to_remove]
         for document in docs]
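In [ ]:
# Optional alternative (not used below): gensim's utils.simple_preprocess lowercases and
# tokenizes in one step and already drops single-character and purely numeric tokens, so
# only the stop-word filter is still needed. texts_alt is a hypothetical name kept separate
# from `texts`, which the rest of the notebook continues to use.
#texts_alt = [[word for word in utils.simple_preprocess(document) if word not in stop_words]
#             for document in docs]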
In [9]:
# Remove rare words: keep only tokens that appear at least 10 times across the corpus.
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
texts = [[token for token in text if frequency[token] > 9] for text in texts]
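In [ ]:
# Related alternative (shown for reference only, not applied): gensim's
# Dictionary.filter_extremes can prune rare terms at the dictionary level, but it
# thresholds on document frequency (how many documents contain a token) rather than
# the total corpus count used above, so the resulting vocabulary would differ slightly.
#dictionary_alt = corpora.Dictionary(texts)
#dictionary_alt.filter_extremes(no_below=10, no_above=1.0, keep_n=None)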
In [ ]:
# Create a dictionary indexing the unique terms:
dictionary = corpora.Dictionary(texts)
# store the dictionary, for future reference
dictionary.save('wiki.dict')
In [ ]:
# Create a sparsely formatted corpus:
corpus = [dictionary.doc2bow(text) for text in texts]
# Store to disk, for later use:
corpora.MmCorpus.serialize('wiki.mm', corpus)
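In [ ]:
# For a later session, the saved dictionary and corpus can be loaded back from disk
# (round-trip sketch; file names match the save calls above):
#dictionary = corpora.Dictionary.load('wiki.dict')
#corpus = corpora.MmCorpus('wiki.mm')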
In [ ]:
# Specify a number of topics:
K = 100
In [ ]:
# Fit the LDA model (100 topics, 10 passes takes about 1/2 hour, 3 topics 3 passes takes 5 min):
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=K, id2word = dictionary, passes=20, alpha=.1/K, eta=.1/K)
# Save the model object for visualization:
ldamodel.save('wiki.model')
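In [ ]:
# The fitted model can later be reloaded without refitting (sketch; path matches the save above):
#ldamodel = gensim.models.ldamodel.LdaModel.load('wiki.model')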
In [ ]:
# Collect phi_kv, the matrix of topic-word proportions; theta_nk, the matrix of document-topic
# proportions; f_v, the word frequencies; p_v, the term probabilities; V_n, the number of unique
# terms in each document; W_n, the document lengths; f_k, the expected number of words assigned
# to each topic; p_k, the probability of each topic occurring; bayes_kv, the probability of a
# topic given a word; and vocab, the list of terms:
phi_kv = np.zeros((K, len(dictionary)))
theta_nk = np.zeros((len(corpus), K))
for k in range(K):
    # get_topic_terms returns (word_id, probability) pairs sorted by probability,
    # so index into phi_kv by word id rather than by position.
    for word_id, prob in ldamodel.get_topic_terms(k, topn=len(dictionary)):
        phi_kv[k, word_id] = prob
for n in range(len(corpus)):
    theta_nk[n,:] = np.array([topic_probs_double[1] for topic_probs_double in ldamodel.get_document_topics(corpus[n], minimum_probability=0)])
counts = Counter()
for text in texts:
    for word in text:
        counts[word] += 1
f_v = np.array([counts[dictionary[word_ind]] for word_ind in range(len(dictionary))])
p_v = f_v/sum(f_v)
V_n = np.array([len(doc) for doc in corpus])
W_n = np.array([sum(word[1] for word in doc) for doc in corpus])
f_k = np.array([sum(theta_nk[:,k]*W_n) for k in range(K)])
p_k = f_k/sum(f_k)
bayes_kv = np.zeros((K, len(dictionary)))
for k in range(K):
    bayes_kv[k,:] = phi_kv[k,:]*p_k[k]/p_v
vocab = [dictionary[i] for i in range(len(dictionary))]
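In [ ]:
# Sanity check: LdaModel.get_topics() returns the full K x V topic-term matrix directly,
# so it should agree closely with the phi_kv assembled element-by-element above.
phi_check = ldamodel.get_topics()
print(np.allclose(phi_kv, phi_check))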
In [ ]:
data = {'topic_term_dists': phi_kv,
        'doc_topic_dists': theta_nk,
        'doc_lengths': W_n,
        'vocab': vocab,
        'term_frequency': f_v}
In [ ]:
movies_vis_data = pyLDAvis.prepare(**data)
In [ ]:
pyLDAvis.display(movies_vis_data)
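In [ ]:
# The interactive visualization can also be saved as a standalone HTML page
# (the file name here is just an example):
#pyLDAvis.save_html(movies_vis_data, 'movies_lda_vis.html')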
In [ ]:
# Specify number of top words:
num_top_words = 10
# Show the top words from each topic:
for k in range(K):
    print("topic " + str(k) + ":")
    topic_top_words = ldamodel.show_topic(k, topn=num_top_words)
    for top_word in topic_top_words:
        print((top_word[0], format(top_word[1], ".2f")))
    print("\n")
In [ ]:
# Obtain the topic distribution for each movie review and each movie's Wikipedia content:
topic_probs = []
for document in corpus:
    topic_probs.append(np.array([topic_probs_double[1] for topic_probs_double in ldamodel.get_document_topics(document, minimum_probability=0)]))
topic_probs_array = np.asarray(topic_probs)
topic_probs_array
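In [ ]:
# calc_pairwise_sims_matrix is called in the next cell but never defined in this notebook;
# the definition below is one plausible sketch built on scipy's pdist/squareform (imported
# above). "hellinger" returns negative Hellinger distances and "cosine" returns cosine
# similarities, so in both cases larger values mean more similar documents.
def calc_pairwise_sims_matrix(prob_array, metric):
    if metric == "hellinger":
        # Hellinger distance between rows p and q: (1/sqrt(2)) * ||sqrt(p) - sqrt(q)||_2,
        # negated so that larger means more similar.
        return -squareform(pdist(np.sqrt(prob_array), metric='euclidean')) / np.sqrt(2)
    elif metric == "cosine":
        # pdist's cosine metric returns 1 - cosine similarity, so convert back.
        return 1 - squareform(pdist(prob_array, metric='cosine'))
    else:
        raise ValueError("unknown metric: " + metric)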
In [ ]:
# Compute pairwise negative hellinger distance matrix for documents:
#document_similarities = calc_pairwise_sims_matrix(topic_probs_array, "hellinger")
# Compute pairwise cosine similarity matrix for documents:
document_similarities = calc_pairwise_sims_matrix(topic_probs_array, "cosine")
In [ ]:
# Return the 10 documents most similar to a specified movie (the first hit is the query itself):
movie_title = "The Search for Santa Paws"
query_index = movie_data.loc[movie_data['title'] == movie_title].index[0]
query_similarities = document_similarities[query_index, :]
closest_movies = sorted(range(len(query_similarities)), key=lambda k: -query_similarities[k])[:10]
movie_data['title'].loc[closest_movies]
In [ ]:
# Compute matrix of topic distribution similarities:
#topic_similarity_matrix = np.zeros((len(corpus), len(corpus)))
#for i in range(5):
# for j in range(5):
# topic_similarity_matrix[i,j] = gensim.matutils.cossim([topic_probs_double[1] for topic_probs_double in ldamodel.get_document_topics(corpus[i], minimum_probability=0)], [topic_probs_double[1] for topic_probs_double in ldamodel.get_document_topics(corpus[j], minimum_probability=0)])
In [ ]:
# Obtain topic distribution for each movie review and every movie content:
#topic_probs = []
#for document in corpus:
# topic_probs.append(np.array([topic_probs_double[1] for topic_probs_double in lda_mult.get_document_topics(document, minimum_probability=0)]))
#topic_probs_array = np.asarray(topic_probs)
#topic_probs_array
In [ ]:
# TO DO: FIND TOP WORDS BY BAYES RULE:
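# One possible sketch for the TODO above, reusing the bayes_kv matrix computed earlier:
# for each topic, rank terms by P(topic | word) rather than P(word | topic), which surfaces
# words that are distinctive for the topic instead of merely frequent within it.
for k in range(K):
    top_inds = np.argsort(-bayes_kv[k, :])[:num_top_words]
    print("topic " + str(k) + ":", [vocab[i] for i in top_inds])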
In [ ]:
# Compute tf-idf weights for the corpus:
tf_idf = gensim.models.TfidfModel(corpus)
# Build a tf-idf similarity index over all documents:
sims = gensim.similarities.Similarity('doc_sims', tf_idf[corpus],
                                      num_features=len(dictionary))
# Find the similarity of every document to a query document:
query_doc = [w.lower() for w in texts[157]]
print(query_doc)
query_doc_bow = dictionary.doc2bow(query_doc)
print(query_doc_bow)
query_doc_tf_idf = tf_idf[query_doc_bow]
print(query_doc_tf_idf)
sims[query_doc_tf_idf]
In [ ]:
# Fit the LDA model with multi core (3 topics 3 iterations 3 passes, 4 min):
#lda_mult = LdaMulticore(corpus, num_topics=K, id2word=dictionary, iterations = 3, passes = 3, alpha=.1/K, eta=.1/K)
In [ ]:
# Store the corpus in Blei's LDA-C format:
#corpora.BleiCorpus.serialize('wiki.lda-c', corpus)
# Read in the corpus in Blei's LDA-C format:
#corpus_ldac = corpora.BleiCorpus('wiki.lda-c')
In [ ]:
# format time slices:
#time_slices = {i: 0 for i in range(1939,2018)}
#for year in movie_data['year']:
# time_slices[year] += 1
#time_slices = [val for val in time_slices.values()]
#time_slices
In [ ]:
# Run dynamic lda on the wikipedia and imdb corpus:
#model = DtmModel('dtm-win64.exe', corpus_ldac, time_slices, num_topics=3, id2word=dictionary)