Modeling

ML Tasks


In [1]:
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

Input


In [2]:
from sklearn.datasets import load_files

# Read every story below ../data/ into memory; corpus.data holds the raw documents.
corpus = load_files("../data/")

doc_count = len(corpus.data)
print("Doc count:", doc_count)
# Compare by value: `is` checks object identity, which only works for small ints by
# accident of CPython's integer caching and raises a SyntaxWarning on Python >= 3.8.
assert doc_count == 56, "Wrong number of documents loaded, should be 56 (56 stories)"


Doc count: 56

Vectorizer


In [3]:
from helpers.tokenizer import TextWrangler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Raw term counts; tokenization is delegated to the project's TextWrangler
# (kind="stem" -- presumably a stemming tokenizer, see helpers.tokenizer).
bow_stem = CountVectorizer(
    tokenizer=TextWrangler(kind="stem"),
    strip_accents="ascii",
)
X_bow_stem = bow_stem.fit_transform(corpus.data)

# TF-IDF weights built with the same tokenizer settings.
tfidf_stem = TfidfVectorizer(
    tokenizer=TextWrangler(kind="stem"),
    strip_accents="ascii",
)
X_tfidf_stem = tfidf_stem.fit_transform(corpus.data)


[nltk_data] Downloading package punkt to ../nltk/...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to ../nltk/...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to ../nltk/...
[nltk_data]   Package wordnet is already up-to-date!

Models


In [4]:
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, NMF

n_topics = 5

# Every model extracts the same number of topics and uses the same seed.
shared_model_kwargs = dict(n_components=n_topics, random_state=23)

# NOTE(review): scikit-learn documents learning_decay / learning_offset as parameters
# of LDA's *online* learning method -- confirm learning_method in the installed version.
lda = LatentDirichletAllocation(learning_decay=0.5, learning_offset=1.,
                                **shared_model_kwargs)
lsa = TruncatedSVD(**shared_model_kwargs)
# NOTE(review): newer scikit-learn releases replaced NMF's `alpha` with
# alpha_W / alpha_H -- confirm against the pinned version before upgrading.
nmf = NMF(solver="mu", beta_loss="kullback-leibler", alpha=0.1,
          **shared_model_kwargs)

In [5]:
# Search space for the "lda" pipeline step (keys follow the <step>__<parameter> scheme).
lda_params = dict(
    lda__learning_decay=[0.5, 0.7, 0.9],
    lda__learning_offset=[1., 5., 10.],
)

Pipelines


In [6]:
from sklearn.pipeline import Pipeline

# LDA consumes raw term counts.
lda_pipe = Pipeline(steps=[("bow", bow_stem), ("lda", lda)])

# LSA and NMF both consume TF-IDF weights; note that the two pipelines hold the
# very same tfidf_stem object, so fitting either one refits that shared vectorizer.
lsa_pipe = Pipeline(steps=[("tfidf", tfidf_stem), ("lsa", lsa)])
nmf_pipe = Pipeline(steps=[("tfidf", tfidf_stem), ("nmf", nmf)])

Gridsearch


In [7]:
from sklearn.model_selection import GridSearchCV

# 5-fold grid search over lda_params on the LDA pipeline, using all available cores.
# The search object is only constructed here; uncomment the lines below to run it.
lda_model = GridSearchCV(
    estimator=lda_pipe,
    param_grid=lda_params,
    cv=5,
    n_jobs=-1,
)
# lda_model.fit(corpus.data)
# lda_model.best_params_

Training


In [8]:
# Fit the pipelines on the full corpus, in the order LDA -> NMF -> LSA.
for pipe in (lda_pipe, nmf_pipe):
    pipe.fit(corpus.data)
# Kept as the cell's last expression so the fitted LSA pipeline is displayed.
lsa_pipe.fit(corpus.data)


Out[8]:
Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...', TruncatedSVD(algorithm='randomized', n_components=5, n_iter=5,
       random_state=23, tol=0.0))])

Evaluation


In [9]:
print("LDA")
# Scored on the same documents the pipeline was fitted on (no held-out data here).
lda_log_likelihood = lda_pipe.score(corpus.data)
print("Log Likelihood:", lda_log_likelihood)


LDA
Log Likelihood: -1281860.968742299

Visual Inspection


In [10]:
def df_topic_model(vectorizer, model, n_words=20):
    """Tabulate the highest-weighted vocabulary terms of each topic.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Must provide ``get_feature_names_out()`` (scikit-learn >= 1.0) or the
        older ``get_feature_names()``; either returns the vocabulary terms.
    model : fitted topic model
        Must provide ``components_``, iterated as one row of term weights per topic.
    n_words : int, default 20
        Number of top-weighted terms to keep per topic.

    Returns
    -------
    pandas.DataFrame
        One row per topic (index "Topic i") and one column per rank
        ("Word j"), with "Word 0" holding the highest-weighted term.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
    # and fall back to the old name so older environments keep working.
    feature_names = getattr(vectorizer, "get_feature_names_out", None)
    if feature_names is None:
        feature_names = vectorizer.get_feature_names
    keywords = np.asarray(feature_names())

    topic_keywords = []
    for topic_weights in model.components_:
        # Sorting the negated weights ascending yields descending weight order.
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))

    df_topic_keywords = pd.DataFrame(topic_keywords)
    df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
    df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]

    return df_topic_keywords

In [11]:
print("LDA")
# Top 15 terms per LDA topic, looked up in the count vectorizer's vocabulary.
df_topic_model(bow_stem, lda_pipe.named_steps["lda"], n_words=15)


LDA
Out[11]:
Word 0 Word 1 Word 2 Word 3 Word 4 Word 5 Word 6 Word 7 Word 8 Word 9 Word 10 Word 11 Word 12 Word 13 Word 14
Topic 0 room com fac hand hous know littl wom think look way lady say door mat
Topic 1 hand know com let room look tim sir think littl say fac door cas cam
Topic 2 com room know hand fac tim day cas way look think door littl window sir
Topic 3 com hand room littl look tim night hous op good door know think old cam
Topic 4 com look know cas turn left young road littl think day let hand fac shal

In [12]:
print("LSA")
# Top 15 terms per LSA component, looked up in the TF-IDF vectorizer's vocabulary.
df_topic_model(tfidf_stem, lsa_pipe.named_steps["lsa"], n_words=15)


LSA
Out[12]:
Word 0 Word 1 Word 2 Word 3 Word 4 Word 5 Word 6 Word 7 Word 8 Word 9 Word 10 Word 11 Word 12 Word 13 Word 14
Topic 0 com room hand know look littl tim fac think hous door let cas way cam
Topic 1 lestrad bust oldacr mccarthy mcfarlane norwood mycroft sarah napoleon jona pearl moriarty beppo hark pap
Topic 2 godfrey colonel staunton straker cunningham hors emswor stabl moor armstrong overton bicyc jam ross alec
Topic 3 hopkin trev blessington carey stanley hut room smi harpoon captain tregen pet stackhurst ship moriarty
Topic 4 robert godfrey lestrad ferguson staunton mccarthy lady shoscomb emswor rond mason leonardo coffin crypt hopkin

In [13]:
print("NMF")
# Top 15 terms per NMF component, looked up in the TF-IDF vectorizer's vocabulary.
df_topic_model(tfidf_stem, nmf_pipe.named_steps["nmf"], n_words=15)


NMF
Out[13]:
Word 0 Word 1 Word 2 Word 3 Word 4 Word 5 Word 6 Word 7 Word 8 Word 9 Word 10 Word 11 Word 12 Word 13 Word 14
Topic 0 wast wait think turn us valu thought watch act whisp thing adv ask way vent
Topic 1 tap vast wednesday veng shav untidy shal train unexpect unwieldy incid track valet van ush
Topic 2 mov vein tast thirteen wish sunk unfold upward parl leav journey priv victor porch unsight
Topic 3 scyll tumbl upset arm stat suff passeng sur tre test account tawny reach hang bind
Topic 4 read shook ut tackl til tru vehic tel triumph arm villa jaw baronet unev moth

In [14]:
import pyLDAvis
from pyLDAvis.sklearn import prepare
# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()

# Interactive view of the fitted LDA step, using the count matrix and its vectorizer;
# topic positions are laid out with t-SNE.
prepare(lda_pipe.named_steps["lda"], X_bow_stem, bow_stem, mds="tsne")


/home/datadonk23/anaconda3/envs/holmes-topic-models/lib/python3.6/site-packages/pyLDAvis/_prepare.py:257: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  return pd.concat([default_term_info] + list(topic_dfs))
Out[14]:

In [15]:
# Same interactive view for the fitted NMF step, using the TF-IDF matrix and vectorizer.
prepare(nmf_pipe.named_steps["nmf"], X_tfidf_stem, tfidf_stem, mds="tsne")


/home/datadonk23/anaconda3/envs/holmes-topic-models/lib/python3.6/site-packages/pyLDAvis/_prepare.py:223: RuntimeWarning: divide by zero encountered in log
  kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
/home/datadonk23/anaconda3/envs/holmes-topic-models/lib/python3.6/site-packages/pyLDAvis/_prepare.py:240: RuntimeWarning: divide by zero encountered in log
  log_lift = np.log(topic_term_dists / term_proportion)
/home/datadonk23/anaconda3/envs/holmes-topic-models/lib/python3.6/site-packages/pyLDAvis/_prepare.py:241: RuntimeWarning: divide by zero encountered in log
  log_ttd = np.log(topic_term_dists)
/home/datadonk23/anaconda3/envs/holmes-topic-models/lib/python3.6/site-packages/pyLDAvis/_prepare.py:257: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  return pd.concat([default_term_info] + list(topic_dfs))
Out[15]:

Conclusion:

Topic models derived from the different approaches look dissimilar. The top-word distribution of NMF appears most meaningful, mostly because its topics don't share the same words (due to the NMF algorithm). The LSA topic model is more interpretable than its LDA counterpart. Nonetheless, the topics from both are hard to distinguish and don't make much sense. Therefore I'll go with the NMF topic model for the step that assigns stories to novel collections.

Jaccard Index


In [16]:
# Top-10 term tables per model, used for the pairwise Jaccard comparison.
df_topic_word_lda = df_topic_model(bow_stem, lda_pipe.named_steps["lda"], n_words=10)
df_topic_word_lsa = df_topic_model(tfidf_stem, lsa_pipe.named_steps["lsa"], n_words=10)
df_topic_word_nmf = df_topic_model(tfidf_stem, nmf_pipe.named_steps["nmf"], n_words=10)

In [17]:
def jaccard_index(list1, list2):
    """Return the Jaccard similarity of two collections.

    Both inputs are converted to sets, so duplicates and ordering are ignored.
    The result is ``len(intersection) / len(union)``, a float in [0, 1].

    Parameters
    ----------
    list1, list2 : iterable of hashable items

    Returns
    -------
    float
        1.0 when both inputs are empty (two empty sets are identical);
        this also avoids the 0/0 division the plain formula would hit.
    """
    s1, s2 = set(list1), set(list2)
    union = s1 | s2
    if not union:
        return 1.0
    return len(s1 & s2) / len(union)

In [18]:
# The comparison pairs rows by label ("Topic i"), so all tables need the same row count.
# NOTE(review): topic i of one model is compared with topic i of another; topic
# numbering is presumably arbitrary per model, so the pairing may not be meaningful.
assert df_topic_word_lda.shape[0] == df_topic_word_lsa.shape[0] == df_topic_word_nmf.shape[0], "n_topics mismatch"

sims_lda_lsa, sims_lda_nmf, sims_lsa_nmf = {}, {}, {}
for topic_label in df_topic_word_lda.index:
    words_lda = df_topic_word_lda.loc[topic_label].tolist()
    words_lsa = df_topic_word_lsa.loc[topic_label].tolist()
    words_nmf = df_topic_word_nmf.loc[topic_label].tolist()
    sims_lda_lsa[topic_label] = jaccard_index(words_lda, words_lsa)
    sims_lda_nmf[topic_label] = jaccard_index(words_lda, words_nmf)
    sims_lsa_nmf[topic_label] = jaccard_index(words_lsa, words_nmf)

# One row per model pair, one column per topic, plus the row-wise mean.
df_jaccard_sims = pd.DataFrame(
    [sims_lda_lsa, sims_lda_nmf, sims_lsa_nmf],
    index=["LDA vs LSA", "LDA vs NMF", "LSA vs NMF"],
)
df_jaccard_sims["mean_sim"] = df_jaccard_sims.mean(axis=1)
df_jaccard_sims


Out[18]:
Topic 0 Topic 1 Topic 2 Topic 3 Topic 4 mean_sim
LDA vs LSA 0.818182 0.0 0.0 0.052632 0.0 0.174163
LDA vs NMF 0.052632 0.0 0.0 0.000000 0.0 0.010526
LSA vs NMF 0.052632 0.0 0.0 0.000000 0.0 0.010526

Conclusion:

Topics derived from different topic modeling approaches are fundamentally dissimilar.

Document-topic Assignment


In [19]:
# Run the documents through the fitted NMF pipeline (TF-IDF, then NMF) to obtain
# each document's weights over the topics.
nmf_topic_distr = nmf_pipe.transform(corpus.data)

In [20]:
# Original collection title for each integer label found in corpus.target.
# NOTE(review): the label order presumably follows the sorted folder names under
# ../data/ -- verify against corpus.target_names.
collections_map = {0: "His Last Bow", 1: "The Adventures of Sherlock Holmes",
                   2: "The Case-Book of Sherlock Holmes", 3: "The Memoirs of Sherlock Holmes",
                   4: "The Return of Sherlock Holmes"}

# Titles created from dominant words in topics (keys are NMF topic indices)
novel_collections_map = {0: "The Whispering Ways Sherlock Holmes Waits to Act on Waste", 
                         1: "Vengeful Wednesdays: Unexpected Incidences on the Tapering Train by Sherlock Holmes",
                         2: "A Private Journey of Sherlock Holmes: Thirteen Unfolded Veins on the Move",
                         3: "Sherlock Holmes Tumbling into the hanging arms of Scylla",
                         4: "The Shooking Jaw of Sherlock Holmes in the Villa of the Baronet"}

In [21]:
print("Novel Sherlock Holmes Short Stories Collections:")
for title in novel_collections_map.values():
    print("*", title)

topics = ["Topic" + str(i) for i in range(n_topics)]
# Story title = file name without directory and extension, underscores turned into
# spaces. os.path copes with the platform's path separator and strips only the final
# extension, so a title containing a dot is not cut short.
docs = [os.path.splitext(os.path.basename(f_name))[0].replace("_", " ")
        for f_name in corpus.filenames]

# Rounded weights are for display only.
df_document_topic = pd.DataFrame(np.round(nmf_topic_distr, 3), columns=topics, index=docs)
# Choose the dominant topic from the unrounded weights so that rounding to three
# decimals cannot create artificial ties between topics.
df_document_topic["assigned_topic"] = np.argmax(nmf_topic_distr, axis=1)
df_document_topic["orig_collection"] = [collections_map[item] for item in corpus.target]
# Fall back to the bare topic index if a topic has no invented title.
df_document_topic["novel_collection"] = [novel_collections_map.get(item, item)
                                         for item in df_document_topic.assigned_topic.values]

df_novel_assignment = df_document_topic.sort_values("assigned_topic").loc[
    :, ["orig_collection", "novel_collection"]
]
df_novel_assignment


Novel Sherlock Holmes Short Stories Collections:
* The Whispering Ways Sherlock Holmes Waits to Act on Waste
* Vengeful Wednesdays: Unexpected Incidences on the Tapering Train by Sherlock Holmes
* A Private Journey of Sherlock Holmes: Thirteen Unfolded Veins on the Move
* Sherlock Holmes Tumbling into the hanging arms of Scylla
* The Shooking Jaw of Sherlock Holmes in the Villa of the Baronet
Out[21]:
orig_collection novel_collection
THE ADVENTURE OF CHARLES AUGUSTUS MILVERTON The Return of Sherlock Holmes The Whispering Ways Sherlock Holmes Waits to A...
THE ADVENTURE OF THE THREE STUDENTS The Return of Sherlock Holmes The Whispering Ways Sherlock Holmes Waits to A...
THE NAVAL TREATY The Memoirs of Sherlock Holmes The Whispering Ways Sherlock Holmes Waits to A...
THE ADVENTURE OF THE COPPER BEECHES The Adventures of Sherlock Holmes The Whispering Ways Sherlock Holmes Waits to A...
THE ADVENTURE OF THE RED CIRCLE His Last Bow The Whispering Ways Sherlock Holmes Waits to A...
THE ADVENTURE OF THE SPECKLED BAND The Adventures of Sherlock Holmes The Whispering Ways Sherlock Holmes Waits to A...
THE YELLOW FACE The Memoirs of Sherlock Holmes The Whispering Ways Sherlock Holmes Waits to A...
THE MAN WITH THE TWISTED LIP The Adventures of Sherlock Holmes The Whispering Ways Sherlock Holmes Waits to A...
THE ADVENTURE OF THE RETIRED COLOURMAN The Case-Book of Sherlock_Holmes The Whispering Ways Sherlock Holmes Waits to A...
THE ADVENTURE OF THE THREE GARRIDEBS The Case-Book of Sherlock_Holmes The Whispering Ways Sherlock Holmes Waits to A...
THE ADVENTURE OF WISTERIA LODGE His Last Bow The Whispering Ways Sherlock Holmes Waits to A...
A SCANDAL IN BOHEMIA The Adventures of Sherlock Holmes The Whispering Ways Sherlock Holmes Waits to A...
THE ADVENTURE OF THE MAZARIN STONE The Case-Book of Sherlock_Holmes The Whispering Ways Sherlock Holmes Waits to A...
THE MUSGRAVE RITUAL The Memoirs of Sherlock Holmes The Whispering Ways Sherlock Holmes Waits to A...
THE STOCK-BROKER'S CLERK The Memoirs of Sherlock Holmes The Whispering Ways Sherlock Holmes Waits to A...
THE ADVENTURE OF THE BERYL CORONET The Adventures of Sherlock Holmes The Whispering Ways Sherlock Holmes Waits to A...
A CASE OF IDENTITY The Adventures of Sherlock Holmes The Whispering Ways Sherlock Holmes Waits to A...
HIS LAST BOW His Last Bow The Whispering Ways Sherlock Holmes Waits to A...
THE ILLUSTRIOUS CLIENT The Case-Book of Sherlock_Holmes The Whispering Ways Sherlock Holmes Waits to A...
THE ADVENTURE OF THE DANCING MEN The Return of Sherlock Holmes The Whispering Ways Sherlock Holmes Waits to A...
THE REIGATE SQUIRES The Memoirs of Sherlock Holmes The Whispering Ways Sherlock Holmes Waits to A...
THE ADVENTURE OF THE CREEPING MAN The Case-Book of Sherlock_Holmes The Whispering Ways Sherlock Holmes Waits to A...
THE ADVENTURE OF THE THREE GABLES The Case-Book of Sherlock_Holmes The Whispering Ways Sherlock Holmes Waits to A...
THE ADVENTURE OF THE BLUE CARBUNCLE The Adventures of Sherlock Holmes The Whispering Ways Sherlock Holmes Waits to A...
THE ADVENTURE OF THE SIX NAPOLEONS The Return of Sherlock Holmes Vengeful Wednesdays: Unexpected Incidences on ...
THE ADVENTURE OF THE BRUCE-PARTINGTON PLANS His Last Bow Vengeful Wednesdays: Unexpected Incidences on ...
THE BOSCOMBE VALLEY MYSTERY The Adventures of Sherlock Holmes Vengeful Wednesdays: Unexpected Incidences on ...
THE ADVENTURE OF THE SECOND STAIN The Return of Sherlock Holmes Vengeful Wednesdays: Unexpected Incidences on ...
THE ADVENTURE OF THE NORWOOD BUILDER The Return of Sherlock Holmes Vengeful Wednesdays: Unexpected Incidences on ...
THE ADVENTURE OF THE NOBLE BACHELOR The Adventures of Sherlock Holmes Vengeful Wednesdays: Unexpected Incidences on ...
THE RED-HEADED LEAGUE The Adventures of Sherlock Holmes Vengeful Wednesdays: Unexpected Incidences on ...
THE ADVENTURE OF THE CARDBOARD BOX His Last Bow Vengeful Wednesdays: Unexpected Incidences on ...
THE ADVENTURE OF THE PRIORY SCHOOL The Return of Sherlock Holmes A Private Journey of Sherlock Holmes: Thirteen...
THE GREEK INTERPRETER The Memoirs of Sherlock Holmes A Private Journey of Sherlock Holmes: Thirteen...
THE BLANCHED SOLDIER The Case-Book of Sherlock_Holmes A Private Journey of Sherlock Holmes: Thirteen...
THE ADVENTURE OF THE ENGINEER'S THUMB The Adventures of Sherlock Holmes A Private Journey of Sherlock Holmes: Thirteen...
THE CROOKED MAN The Memoirs of Sherlock Holmes A Private Journey of Sherlock Holmes: Thirteen...
THE ADVENTURE OF THE EMPTY HOUSE The Return of Sherlock Holmes A Private Journey of Sherlock Holmes: Thirteen...
THE FINAL PROBLEM The Memoirs of Sherlock Holmes A Private Journey of Sherlock Holmes: Thirteen...
SILVER BLAZE The Memoirs of Sherlock Holmes A Private Journey of Sherlock Holmes: Thirteen...
THE ADVENTURE OF THE MISSING THREE-QUARTER The Return of Sherlock Holmes A Private Journey of Sherlock Holmes: Thirteen...
THE FIVE ORANGE PIPS The Adventures of Sherlock Holmes Sherlock Holmes Tumbling into the hanging arms...
THE RESIDENT PATIENT The Memoirs of Sherlock Holmes Sherlock Holmes Tumbling into the hanging arms...
THE ADVENTURE OF THE DEVIL'S FOOT His Last Bow Sherlock Holmes Tumbling into the hanging arms...
THE ADVENTURE OF BLACK PETER The Return of Sherlock Holmes Sherlock Holmes Tumbling into the hanging arms...
THE ADVENTURE OF THE DYING DETECTIVE His Last Bow Sherlock Holmes Tumbling into the hanging arms...
THE ADVENTURE OF THE ABBEY GRANGE The Return of Sherlock Holmes Sherlock Holmes Tumbling into the hanging arms...
THE GLORIA SCOTT The Memoirs of Sherlock Holmes Sherlock Holmes Tumbling into the hanging arms...
THE ADVENTURE OF THE GOLDEN PINCE-NEZ The Return of Sherlock Holmes Sherlock Holmes Tumbling into the hanging arms...
THE ADVENTURE OF THE LION'S MANE The Case-Book of Sherlock_Holmes Sherlock Holmes Tumbling into the hanging arms...
THE ADVENTURE OF THE VEILED LODGER The Case-Book of Sherlock_Holmes The Shooking Jaw of Sherlock Holmes in the Vil...
THE ADVENTURE OF THE SUSSEX VAMPIRE The Case-Book of Sherlock_Holmes The Shooking Jaw of Sherlock Holmes in the Vil...
THE DISAPPEARANCE OF LADY FRANCES CARFAX His Last Bow The Shooking Jaw of Sherlock Holmes in the Vil...
THE ADVENTURE OF THE SOLITARY CYCLIST The Return of Sherlock Holmes The Shooking Jaw of Sherlock Holmes in the Vil...
THE ADVENTURE OF SHOSCOMBE OLD PLACE The Case-Book of Sherlock_Holmes The Shooking Jaw of Sherlock Holmes in the Vil...
THE PROBLEM OF THOR BRIDGE The Case-Book of Sherlock_Holmes The Shooking Jaw of Sherlock Holmes in the Vil...

In [22]:
from yellowbrick.text import TSNEVisualizer

# t-SNE is stochastic; fix the seed (23, as for the topic models) so the projection
# is reproducible across runs.
tsne = TSNEVisualizer(random_state=23)
# Project the TF-IDF document vectors, coloured by the assigned novel collection.
tsne.fit(X_tfidf_stem, df_document_topic.novel_collection)
# Yellowbrick 1.0 renamed poof() to show(); use whichever this version provides.
render = getattr(tsne, "show", None) or tsne.poof
render();


'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.
'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.
'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.
'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.
'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.

Conclusion:

A new ordering of the short stories from the Sherlock Holmes series into collections based on NMF topic models is possible. Naming the collections according to the dominant words in the topics is also possible, but the names sound strange and don't make much sense. The projection of word vectors from the documents looks slightly more structured than the original ordering by the author. Nevertheless, the cost of this ordering is that it somehow loses the tension in the canon (e.g. "The Final Problem" and "The Empty House" are assigned to the same collection). So after all, I'd go with the original ordering by Sir Arthur Conan Doyle.