In [1]:
#!pip install --ignore-installed --upgrade pandas
In [12]:
from sklearn.externals import joblib
import pandas as pd
import numpy as np
import gensim
from gensim import corpora, utils
from gensim.models.wrappers.dtmmodel import DtmModel
from collections import defaultdict, Counter
import sklearn.metrics
from scipy.spatial.distance import pdist, squareform
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models.ldamulticore import LdaMulticore
import nltk
import pyLDAvis
import pyLDAvis.gensim
#nltk.download()
In [3]:
# Read in wikipedia and imdb data set from .pickle file:
data = pd.read_pickle('clean_complete_ngram.pickle')
In [4]:
# Show the data:
data.head(5)
Out[4]:
In [5]:
# Select title, year, imdb reviews, and wikipedia content columns:
movie_data = data[["title", "year", "reviews", "content"]]
# Create a new 'variable' column recording whether each document came from the reviews or the
# Wikipedia content column, so each reviews entry and each content entry is its own document:
movie_data = movie_data.melt(id_vars=["title", "year"], value_vars=["reviews", "content"])
In [6]:
# Show the new data:
movie_data.head(5)
Out[6]:
In [7]:
# Collect the documents as a list of strings:
docs = movie_data['value'].astype(str).tolist()
In [8]:
# Remove common stop words, standalone numbers, and single letters, then tokenize:
stop_words = set(stopwords.words('english'))
# Build the number tokens as a set of strings so that set.union removes whole tokens
# (a plain string here would only contribute individual characters):
numbers = set(str(i) for i in range(0, 999))
letters = set('a b c d e f g h i j k l m n o p q r s t u v w x y z'.split())
words_to_remove = stop_words.union(numbers).union(letters)
texts = [[word for word in document.lower().split() if word not in words_to_remove]
         for document in docs]
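In [ ]:
# Optional alternative (not used below): gensim's utils.simple_preprocess lowercases and
# tokenizes in one step and already drops single-character and purely numeric tokens, so
# only the stop-word filter is still needed. texts_alt is a hypothetical name kept separate
# from `texts`, which the rest of the notebook continues to use.
#texts_alt = [[word for word in utils.simple_preprocess(document) if word not in stop_words]
#             for document in docs]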
In [9]:
# Remove rare words: keep only tokens that appear at least 10 times across the corpus.
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
texts = [[token for token in text if frequency[token] > 9] for text in texts]
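In [ ]:
# Related alternative (shown for reference only, not applied): gensim's
# Dictionary.filter_extremes can prune rare terms at the dictionary level, but it
# thresholds on document frequency (how many documents contain a token) rather than
# the total corpus count used above, so the resulting vocabulary would differ slightly.
#dictionary_alt = corpora.Dictionary(texts)
#dictionary_alt.filter_extremes(no_below=10, no_above=1.0, keep_n=None)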
In [ ]:
# Create a dictionary indexing the unique terms:
dictionary = corpora.Dictionary(texts)
# store the dictionary, for future reference
dictionary.save('wiki.dict')
In [ ]:
# Create a sparsely formatted corpus:
corpus = [dictionary.doc2bow(text) for text in texts]
# Store to disk, for later use:
corpora.MmCorpus.serialize('wiki.mm', corpus)
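In [ ]:
# For a later session, the saved dictionary and corpus can be loaded back from disk
# (round-trip sketch; file names match the save calls above):
#dictionary = corpora.Dictionary.load('wiki.dict')
#corpus = corpora.MmCorpus('wiki.mm')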
In [ ]:
# Specify a number of topics:
K = 100
In [ ]:
# Fit the LDA model (100 topics, 10 passes takes about 1/2 hour, 3 topics 3 passes takes 5 min):
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=K, id2word = dictionary, passes=20, alpha=.1/K, eta=.1/K)
# Save the model object for visualization:
ldamodel.save('wiki.model')
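In [ ]:
# The fitted model can later be reloaded without refitting (sketch; path matches the save above):
#ldamodel = gensim.models.ldamodel.LdaModel.load('wiki.model')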
In [ ]:
# Collect phi_kv, the matrix of topic-word proportions; theta_nk, the matrix of document-topic
# proportions; f_v, the word frequencies; p_v, the term probabilities; V_n, the number of unique
# terms in each document; W_n, the document lengths; f_k, the expected number of words assigned
# to each topic; p_k, the probability of each topic occurring; bayes_kv, the probability of a
# topic given a word; and vocab, the list of terms:
phi_kv = np.zeros((K, len(dictionary)))
theta_nk = np.zeros((len(corpus), K))
for k in range(K):
    # get_topic_terms returns (word_id, probability) pairs sorted by probability,
    # so index into phi_kv by word id rather than by position.
    for word_id, prob in ldamodel.get_topic_terms(k, topn=len(dictionary)):
        phi_kv[k, word_id] = prob
for n in range(len(corpus)):
    theta_nk[n,:] = np.array([topic_probs_double[1] for topic_probs_double in ldamodel.get_document_topics(corpus[n], minimum_probability=0)])
counts = Counter()
for text in texts:
    for word in text:
        counts[word] += 1
f_v = np.array([counts[dictionary[word_ind]] for word_ind in range(len(dictionary))])
p_v = f_v/sum(f_v)
V_n = np.array([len(doc) for doc in corpus])
W_n = np.array([sum(word[1] for word in doc) for doc in corpus])
f_k = np.array([sum(theta_nk[:,k]*W_n) for k in range(K)])
p_k = f_k/sum(f_k)
bayes_kv = np.zeros((K, len(dictionary)))
for k in range(K):
    bayes_kv[k,:] = phi_kv[k,:]*p_k[k]/p_v
vocab = [dictionary[i] for i in range(len(dictionary))]
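In [ ]:
# Sanity check: LdaModel.get_topics() returns the full K x V topic-term matrix directly,
# so it should agree closely with the phi_kv assembled element-by-element above.
phi_check = ldamodel.get_topics()
print(np.allclose(phi_kv, phi_check))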
In [ ]:
data = {'topic_term_dists': phi_kv,
        'doc_topic_dists': theta_nk,
        'doc_lengths': W_n,
        'vocab': vocab,
        'term_frequency': f_v}
In [ ]:
movies_vis_data = pyLDAvis.prepare(**data)
In [ ]:
pyLDAvis.display(movies_vis_data)
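In [ ]:
# The interactive visualization can also be saved as a standalone HTML page
# (the file name here is just an example):
#pyLDAvis.save_html(movies_vis_data, 'movies_lda_vis.html')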
In [ ]:
# Specify number of top words:
num_top_words = 10
# Show the top words from each topic:
for k in range(K):
    print("topic " + str(k) + ":")
    topic_top_words = ldamodel.show_topic(k, topn=num_top_words)
    for top_word in topic_top_words:
        print((top_word[0], format(top_word[1], ".2f")))
    print("\n")
In [ ]:
# Obtain the topic distribution for each movie review and each movie's Wikipedia content:
topic_probs = []
for document in corpus:
    topic_probs.append(np.array([topic_probs_double[1] for topic_probs_double in ldamodel.get_document_topics(document, minimum_probability=0)]))
topic_probs_array = np.asarray(topic_probs)
topic_probs_array
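In [ ]:
# calc_pairwise_sims_matrix is called in the next cell but never defined in this notebook;
# the definition below is one plausible sketch built on scipy's pdist/squareform (imported
# above). "hellinger" returns negative Hellinger distances and "cosine" returns cosine
# similarities, so in both cases larger values mean more similar documents.
def calc_pairwise_sims_matrix(prob_array, metric):
    if metric == "hellinger":
        # Hellinger distance between rows p and q: (1/sqrt(2)) * ||sqrt(p) - sqrt(q)||_2,
        # negated so that larger means more similar.
        return -squareform(pdist(np.sqrt(prob_array), metric='euclidean')) / np.sqrt(2)
    elif metric == "cosine":
        # pdist's cosine metric returns 1 - cosine similarity, so convert back.
        return 1 - squareform(pdist(prob_array, metric='cosine'))
    else:
        raise ValueError("unknown metric: " + metric)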
In [ ]:
# Compute pairwise negative hellinger distance matrix for documents:
#document_similarities = calc_pairwise_sims_matrix(topic_probs_array, "hellinger")
# Compute pairwise cosine similarity matrix for documents:
document_similarities = calc_pairwise_sims_matrix(topic_probs_array, "cosine")
In [ ]:
# Return the 10 documents most similar to a specified movie (the first hit is the query itself):
movie_title = "The Search for Santa Paws"
query_index = movie_data.loc[movie_data['title'] == movie_title].index[0]
query_similarities = document_similarities[query_index, :]
closest_movies = sorted(range(len(query_similarities)), key=lambda k: -query_similarities[k])[:10]
movie_data['title'].loc[closest_movies]
In [ ]:
# Compute matrix of topic distribution similarities:
#topic_similarity_matrix = np.zeros((len(corpus), len(corpus)))
#for i in range(5):
# for j in range(5):
# topic_similarity_matrix[i,j] = gensim.matutils.cossim([topic_probs_double[1] for topic_probs_double in ldamodel.get_document_topics(corpus[i], minimum_probability=0)], [topic_probs_double[1] for topic_probs_double in ldamodel.get_document_topics(corpus[j], minimum_probability=0)])
In [ ]:
# Obtain topic distribution for each movie review and every movie content:
#topic_probs = []
#for document in corpus:
# topic_probs.append(np.array([topic_probs_double[1] for topic_probs_double in lda_mult.get_document_topics(document, minimum_probability=0)]))
#topic_probs_array = np.asarray(topic_probs)
#topic_probs_array
In [ ]:
# TO DO: FIND TOP WORDS BY BAYES RULE:
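# One possible sketch for the TODO above, reusing the bayes_kv matrix computed earlier:
# for each topic, rank terms by P(topic | word) rather than P(word | topic), which surfaces
# words that are distinctive for the topic instead of merely frequent within it.
for k in range(K):
    top_inds = np.argsort(-bayes_kv[k, :])[:num_top_words]
    print("topic " + str(k) + ":", [vocab[i] for i in top_inds])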
In [ ]:
# Compute tf-idf weights for the corpus:
tf_idf = gensim.models.TfidfModel(corpus)
# Build a tf-idf similarity index over all documents:
sims = gensim.similarities.Similarity('doc_sims', tf_idf[corpus],
                                      num_features=len(dictionary))
# Find the similarity of every document to a query document:
query_doc = [w.lower() for w in texts[157]]
print(query_doc)
query_doc_bow = dictionary.doc2bow(query_doc)
print(query_doc_bow)
query_doc_tf_idf = tf_idf[query_doc_bow]
print(query_doc_tf_idf)
sims[query_doc_tf_idf]
In [ ]:
# Fit the LDA model with multi core (3 topics 3 iterations 3 passes, 4 min):
#lda_mult = LdaMulticore(corpus, num_topics=K, id2word=dictionary, iterations = 3, passes = 3, alpha=.1/K, eta=.1/K)
In [ ]:
# Store the corpus in Blei's LDA-C format:
#corpora.BleiCorpus.serialize('wiki.lda-c', corpus)
# Read in the corpus in Blei's LDA-C format:
#corpus_ldac = corpora.BleiCorpus('wiki.lda-c')
In [ ]:
# format time slices:
#time_slices = {i: 0 for i in range(1939,2018)}
#for year in movie_data['year']:
# time_slices[year] += 1
#time_slices = [val for val in time_slices.values()]
#time_slices
In [ ]:
# Run dynamic lda on the wikipedia and imdb corpus:
#model = DtmModel('dtm-win64.exe', corpus_ldac, time_slices, num_topics=3, id2word=dictionary)