In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
from sklearn.datasets import load_files
# Load every document found under ../data/ and sanity-check the corpus size.
corpus = load_files("../data/")
doc_count = len(corpus.data)
print("Doc count:", doc_count)
# Compare by value: `is` tests object identity, which only happens to work for
# small ints in CPython and triggers a SyntaxWarning on recent Python versions.
assert doc_count == 56, "Wrong number of documents loaded, should be 56 (56 stories)"
In [3]:
from helpers.tokenizer import TextWrangler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Bag-of-words counts over the corpus; each vectorizer gets its own
# TextWrangler(kind="stem") tokenizer instance.
bow_stem = CountVectorizer(
    tokenizer=TextWrangler(kind="stem"),
    strip_accents="ascii",
)
X_bow_stem = bow_stem.fit_transform(corpus.data)

# TF-IDF weighted counterpart, configured identically.
tfidf_stem = TfidfVectorizer(
    tokenizer=TextWrangler(kind="stem"),
    strip_accents="ascii",
)
X_tfidf_stem = tfidf_stem.fit_transform(corpus.data)
In [4]:
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, NMF
# Number of components shared by all three decompositions below.
n_topics = 5

lda = LatentDirichletAllocation(
    n_components=n_topics,
    learning_offset=1.0,
    learning_decay=0.5,
    random_state=23,
)

lsa = TruncatedSVD(random_state=23, n_components=n_topics)

# NOTE(review): the single `alpha` regularisation argument is believed to have
# been replaced in newer scikit-learn releases -- confirm against the pinned version.
nmf = NMF(
    n_components=n_topics,
    beta_loss="kullback-leibler",
    solver="mu",
    alpha=0.1,
    random_state=23,
)
In [5]:
# Candidate values for the "lda" pipeline step, keyed as <step>__<parameter>.
lda_params = dict(
    lda__learning_decay=[0.5, 0.7, 0.9],
    lda__learning_offset=[1.0, 5.0, 10.0],
)
In [6]:
from sklearn.pipeline import Pipeline
# One pipeline per approach: vectorizer first, decomposition second.
# LSA and NMF both reuse the same tfidf_stem vectorizer object.
lda_pipe = Pipeline(steps=[("bow", bow_stem), ("lda", lda)])
lsa_pipe = Pipeline(steps=[("tfidf", tfidf_stem), ("lsa", lsa)])
nmf_pipe = Pipeline(steps=[("tfidf", tfidf_stem), ("nmf", nmf)])
In [7]:
from sklearn.model_selection import GridSearchCV
# Grid search over lda_params for the LDA pipeline. The fit call below is
# commented out, so lda_model is only constructed, never fitted, in this notebook.
lda_model = GridSearchCV(
    estimator=lda_pipe,
    param_grid=lda_params,
    n_jobs=-1,
    cv=5,
)
#lda_model.fit(corpus.data)
#lda_model.best_params_
In [8]:
# Fit all three pipelines on the raw documents, in the order LDA, NMF, LSA.
for pipe in (lda_pipe, nmf_pipe):
    pipe.fit(corpus.data)
# Kept as the final bare expression so the cell still displays the fitted LSA pipeline.
lsa_pipe.fit(corpus.data)
Out[8]:
In [9]:
# Report the score of the fitted LDA pipeline on the training documents.
print("LDA")
lda_log_likelihood = lda_pipe.score(corpus.data)
print("Log Likelihood:", lda_log_likelihood)
In [10]:
def df_topic_model(vectorizer, model, n_words=20):
    """Tabulate the highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()`` or, on older
        scikit-learn releases, ``get_feature_names()``.
    model : object
        Fitted decomposition exposing ``components_``, iterated as one
        vector of term weights per topic.
    n_words : int, default 20
        Number of top terms to keep per topic.

    Returns
    -------
    pandas.DataFrame
        One row per topic (index ``"Topic i"``) and one column per rank
        (``"Word j"``), terms ordered by descending weight.
    """
    # Prefer the current accessor; fall back to the legacy one so the helper
    # works regardless of which of the two the installed vectorizer provides.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    keywords = np.asarray(get_names())

    # Negating the weights turns the ascending argsort into a descending ranking.
    topic_keywords = [
        keywords.take((-topic_weights).argsort()[:n_words]).tolist()
        for topic_weights in model.components_
    ]

    df_topic_keywords = pd.DataFrame(topic_keywords)
    df_topic_keywords.columns = ['Word ' + str(i) for i in range(df_topic_keywords.shape[1])]
    df_topic_keywords.index = ['Topic ' + str(i) for i in range(df_topic_keywords.shape[0])]
    return df_topic_keywords
In [11]:
# Top 15 terms per LDA topic (bag-of-words vocabulary).
print("LDA")
df_topic_model(bow_stem, lda_pipe.named_steps["lda"], n_words=15)
Out[11]:
In [12]:
# Top 15 terms per LSA component (TF-IDF vocabulary).
print("LSA")
df_topic_model(tfidf_stem, lsa_pipe.named_steps["lsa"], n_words=15)
Out[12]:
In [13]:
# Top 15 terms per NMF component (TF-IDF vocabulary).
print("NMF")
df_topic_model(tfidf_stem, nmf_pipe.named_steps["nmf"], n_words=15)
Out[13]:
In [14]:
import pyLDAvis
from pyLDAvis.sklearn import prepare
# Switch pyLDAvis to in-notebook rendering, then build the interactive view
# for the fitted LDA step from the bag-of-words matrix and its vectorizer.
pyLDAvis.enable_notebook()
# NOTE(review): mds="tsne" presumably selects t-SNE for the topic layout -- confirm against pyLDAvis docs.
prepare(lda_pipe.named_steps.lda, X_bow_stem, bow_stem, mds="tsne")
Out[14]:
In [15]:
# Same visualisation for the fitted NMF step, fed with the TF-IDF matrix and its vectorizer.
prepare(nmf_pipe.named_steps.nmf, X_tfidf_stem, tfidf_stem, mds="tsne")
Out[15]:
Topic models derived from the different approaches look dissimilar. The top-word distribution of NMF appears most meaningful, mostly because its topics don't share the same words (a consequence of the NMF algorithm). The LSA topic model is more interpretable than its LDA counterpart. Nonetheless, the topics from both are hard to distinguish and don't make much sense. Therefore I'll go with the NMF topic model for the step of assigning stories to novel collections.
In [16]:
# Top 10 terms per topic for each approach; compared pairwise further down.
df_topic_word_lda = df_topic_model(bow_stem, lda_pipe.named_steps["lda"], n_words=10)
df_topic_word_lsa = df_topic_model(tfidf_stem, lsa_pipe.named_steps["lsa"], n_words=10)
df_topic_word_nmf = df_topic_model(tfidf_stem, nmf_pipe.named_steps["nmf"], n_words=10)
In [17]:
def jaccard_index(list1, list2):
    """Return the Jaccard similarity of two collections.

    Both arguments are converted to sets, so duplicates and ordering are
    ignored. The result is ``|A & B| / |A | B|``, a float in ``[0, 1]``.

    Parameters
    ----------
    list1, list2 : iterable of hashable
        The two collections to compare.

    Returns
    -------
    float
        1.0 for identical sets, 0.0 for disjoint ones. Two empty inputs are
        treated as identical and yield 1.0 instead of dividing by zero.
    """
    s1, s2 = set(list1), set(list2)
    union = s1 | s2
    if not union:
        # Both inputs are empty: by convention the sets are identical.
        return 1.0
    return len(s1 & s2) / len(union)
In [18]:
# All three top-word tables must describe the same number of topics.
assert (df_topic_word_lda.shape[0]
        == df_topic_word_lsa.shape[0]
        == df_topic_word_nmf.shape[0]), "n_topics mismatch"

# Per-topic Jaccard similarity of the top-word lists, one dict per model pair.
# Topics are matched by row label, i.e. "Topic i" of one model against
# "Topic i" of the other.
sims_lda_lsa, sims_lda_nmf, sims_lsa_nmf = {}, {}, {}
for topic in df_topic_word_lda.index:
    words_lda = df_topic_word_lda.loc[topic].tolist()
    words_lsa = df_topic_word_lsa.loc[topic].tolist()
    words_nmf = df_topic_word_nmf.loc[topic].tolist()

    sims_lda_lsa[topic] = jaccard_index(words_lda, words_lsa)
    sims_lda_nmf[topic] = jaccard_index(words_lda, words_nmf)
    sims_lsa_nmf[topic] = jaccard_index(words_lsa, words_nmf)

df_jaccard_sims = pd.DataFrame(
    [sims_lda_lsa, sims_lda_nmf, sims_lsa_nmf],
    index=["LDA vs LSA", "LDA vs NMF", "LSA vs NMF"],
)
# Row-wise mean across topics, appended as an extra column.
df_jaccard_sims["mean_sim"] = df_jaccard_sims.mean(axis=1)
df_jaccard_sims
Out[18]:
In [19]:
# Run the documents through the fitted TF-IDF + NMF pipeline; the result is
# later used as a documents x topics table (one column per topic).
nmf_topic_distr = nmf_pipe.transform(corpus.data)
In [20]:
# Original collection title for each integer label in corpus.target.
# NOTE(review): assumes the label order matches the data folders -- verify against corpus.target_names.
collections_map = {
    0: "His Last Bow",
    1: "The Adventures of Sherlock Holmes",
    # Stray underscore removed so the title matches the spelling of the others.
    2: "The Case-Book of Sherlock Holmes",
    3: "The Memoirs of Sherlock Holmes",
    4: "The Return of Sherlock Holmes",
}
# Titles created from dominant words in topics
novel_collections_map = {
    0: "The Whispering Ways Sherlock Holmes Waits to Act on Waste",
    1: "Vengeful Wednesdays: Unexpected Incidences on the Tapering Train by Sherlock Holmes",
    2: "A Private Journey of Sherlock Holmes: Thirteen Unfolded Veins on the Move",
    3: "Sherlock Holmes Tumbling into the hanging arms of Scylla",
    4: "The Shooking Jaw of Sherlock Holmes in the Villa of the Baronet",
}
In [21]:
import os

# List the invented collection titles.
print("Novel Sherlock Holmes Short Stories Collections:")
for title in novel_collections_map.values():
    print("*", title)

topics = ["Topic" + str(i) for i in range(n_topics)]
# Document label: file name up to its first dot, underscores turned into
# spaces. os.path.basename keeps this working with any path separator.
docs = [" ".join(os.path.basename(f_name).split(".")[0].split("_"))
        for f_name in corpus.filenames]

# Rounded weights are for display only.
df_document_topic = pd.DataFrame(np.round(nmf_topic_distr, 3), columns=topics, index=docs)
# Pick the dominant topic from the unrounded weights: rounding to three
# decimals can create ties, which argmax would resolve to the lowest index.
df_document_topic["assigned_topic"] = np.argmax(nmf_topic_distr, axis=1)
df_document_topic["orig_collection"] = [collections_map[item] for item in corpus.target]
# Topic ids without a title in the map are kept as the raw id.
df_document_topic["novel_collection"] = [novel_collections_map.get(item, item)
                                         for item in df_document_topic.assigned_topic.values]

df_novel_assignment = (
    df_document_topic
    .sort_values("assigned_topic")
    .loc[:, ["orig_collection", "novel_collection"]]
)
df_novel_assignment
Out[21]:
In [22]:
from yellowbrick.text import TSNEVisualizer
# Visualise the TF-IDF document vectors, labelled by the novel collection
# each story was assigned to.
tsne = TSNEVisualizer()
tsne.fit(X_tfidf_stem, df_document_topic.novel_collection)
# NOTE(review): newer yellowbrick releases reportedly favour show() over poof() -- confirm against the installed version.
tsne.poof()
A new ordering of the short stories from the Sherlock Holmes series into collections based on NMF topic models is possible. Naming the collections according to the dominant words in their topics is also possible, but the names sound strange and don't make much sense. The projection of word vectors from the documents looks slightly more structured than the original ordering by the author. Nevertheless, the cost of this ordering is that it somehow loses the tension in the canon (e.g. "The Final Problem" and "The Empty House" are assigned to the same collection). So, after all, I'd go with the original ordering by Sir Arthur Conan Doyle.