Exploration


In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

Input


In [2]:
from sklearn.datasets import load_files

# Load the stories below ../data/ into one corpus object. shuffle=False keeps the
# documents in load order, which the index-based lookups on raw_corpus.data[0] and
# the target -> collection-name mapping further down rely on.
# NOTE(review): presumably ../data/ holds one sub-folder per story collection - confirm.
raw_corpus = load_files("../data/", shuffle=False)

In [3]:
# Sanity check: the corpus is expected to contain exactly 56 stories.
doc_count = len(raw_corpus.data)
print("Doc count:", doc_count)
# Compare by value (==). `is` checks object identity, which only happens to hold for
# small cached ints in CPython and raises a SyntaxWarning on Python 3.8+.
assert doc_count == 56, "Wrong number of documents loaded, should be 56 (56 stories)"


Doc count: 56

In [4]:
# Peek at the first loaded document: its first 500 and last 200 entries.
first_story = raw_corpus.data[0]
head, tail = first_story[:500], first_story[-200:]
print("Beginning:\n", head, "\n")
print("End:\n", tail)


Beginning:
 b"\n\n\n\n                                  HIS LAST BOW\n\n                         An Epilogue of Sherlock Holmes\n\n                               Arthur Conan Doyle\n\n\n\n     It was nine o'clock at night upon the second of August--the most\n     terrible August in the history of the world. One might have thought\n     already that God's curse hung heavy over a degenerate world, for\n     there was an awesome hush and a feeling of vague expectancy in the\n     sultry and stagnant air. The sun had long set, b" 

End:
 b' her up, Watson, for it\'s time that\n     we were on our way. I have a check for five hundred pounds which\n     should be cashed early, for the drawer is quite capable of stopping\n     it if he can."\n\n'

Baseline


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Baseline: TF-IDF with no custom tokenizer, only ASCII accent stripping and lowercasing.
tfidf = TfidfVectorizer(strip_accents="ascii", lowercase=True)
# Fit the vocabulary and vectorize the whole corpus in one pass.
X = tfidf.fit_transform(raw_corpus.data)

In [6]:
# Vocabulary size of the baseline vectorizer, followed by the matrix shape as cell output.
feature_names = tfidf.get_feature_names()
print(len(feature_names))
X.shape


15875
Out[6]:
(56, 15875)

In [7]:
#tfidf.get_feature_names()

Preparation


In [8]:
# Custom wrangler
from helpers.tokenizer import TextWrangler


[nltk_data] Downloading package punkt to ../nltk/...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to ../nltk/...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to ../nltk/...
[nltk_data]   Package wordnet is already up-to-date!

Vectorizer


In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def _wrangled(vectorizer_cls, kind):
    """Build a `vectorizer_cls` instance that tokenizes through a fresh TextWrangler(kind=kind)."""
    return vectorizer_cls(strip_accents="ascii", tokenizer=TextWrangler(kind=kind))


# Plain bag of words: no custom tokenizer, no stop-word list.
bow_pure = CountVectorizer(stop_words=None, lowercase=True, strip_accents="ascii")
X_bow_pure = bow_pure.fit_transform(raw_corpus.data)

# Counts and tf-idf on top of the "stem" tokenizer.
bow_stem = _wrangled(CountVectorizer, "stem")
X_bow_stem = bow_stem.fit_transform(raw_corpus.data)

tfidf_stem = _wrangled(TfidfVectorizer, "stem")
X_tfidf_stem = tfidf_stem.fit_transform(raw_corpus.data)

# Counts and tf-idf on top of the "lemma" tokenizer.
bow_lemma = _wrangled(CountVectorizer, "lemma")
X_bow_lemma = bow_lemma.fit_transform(raw_corpus.data)

tfidf_lemma = _wrangled(TfidfVectorizer, "lemma")
X_tfidf_lemma = tfidf_lemma.fit_transform(raw_corpus.data)

In [10]:
# For every vectorizer: matrix shape plus the first ten feature names.
summaries = [
    ("Pure bow vector:", X_bow_pure, bow_pure),
    ("Stemmed bow vector:", X_bow_stem, bow_stem),
    ("Lemmatized bow vector:", X_bow_lemma, bow_lemma),
    ("Stemmed tfidf vector:", X_tfidf_stem, tfidf_stem),
    ("Lemmatized tfidf vector:", X_tfidf_lemma, tfidf_lemma),
]
for caption, matrix, vectorizer in summaries:
    print(caption, matrix.shape)
    print(vectorizer.get_feature_names()[:10], "...")


Pure bow vector: (56, 15875)
['000', '10', '100', '1000', '104', '10s', '10th', '11', '1100', '117'] ...
Stemmed bow vector: (56, 8270)
['NUM', 'ab', 'aback', 'abandon', 'abba', 'abbess', 'abbey', 'abbot', 'abduc', 'aber'] ...
Lemmatized bow vector: (56, 13739)
['NUM', 'aback', 'abandon', 'abandoned', 'abandoning', 'abated', 'abbas', 'abbess', 'abbey', 'abbot'] ...
Stemmed tfidf vector: (56, 8270)
['NUM', 'ab', 'aback', 'abandon', 'abba', 'abbess', 'abbey', 'abbot', 'abduc', 'aber'] ...
Lemmatized tfidf vector: (56, 13739)
['NUM', 'aback', 'abandon', 'abandoned', 'abandoning', 'abated', 'abbas', 'abbess', 'abbey', 'abbot'] ...

Vectors

Ranking


In [11]:
# Ranking word patterns
def rank(features, values, metric):
    """Rank terms by a per-term score, highest score first.

    Parameters
    ----------
    features : iterable
        Terms; the term at position i is paired with ``values[0, i]``.
    values : 2-D indexable
        Scores, read from row 0 only.
    metric : str
        Name of the score column in the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``["term", metric]``, sorted by ``metric`` in descending
        order, with an index that starts at 1 (the rank).
    """
    pairs = [(term, values[0, pos]) for pos, term in enumerate(features)]
    table = pd.DataFrame(pairs, columns=["term", metric])
    ranked = table.sort_values(metric, ascending=False).reset_index(drop=True)
    # Shift the fresh 0-based index so that it reads as a 1-based rank.
    ranked.index = ranked.index + 1
    return ranked

In [12]:
# Feature names per tokenizer variant.
stem_features = bow_stem.get_feature_names()
lemma_features = bow_lemma.get_feature_names()
pure_features = bow_pure.get_feature_names()

# Count and tf-idf vectorizers sharing a tokenizer must agree on the vocabulary,
# because the same feature list is reused for both rankings below.
assert stem_features == tfidf_stem.get_feature_names(), "Mismatch of stem feature names (->tokenize)"
assert lemma_features == tfidf_lemma.get_feature_names(), "Mismatch of lemma feature names (->tokenize)"

# Per-term totals over all documents for the count matrices ...
bow_stem_sum, bow_lemma_sum, bow_pure_sum = (
    counts.sum(axis=0) for counts in (X_bow_stem, X_bow_lemma, X_bow_pure)
)

# ... and per-term means over all documents for the tf-idf matrices.
tfidf_stem_mean, tfidf_lemma_mean = (
    weights.mean(axis=0) for weights in (X_tfidf_stem, X_tfidf_lemma)
)

In [13]:
# Rank terms by their summed counts for each tokenizer variant.
bow_pure_ranked = rank(pure_features, bow_pure_sum, "bow_sum")
bow_stem_ranked = rank(stem_features, bow_stem_sum, "bow_sum")
bow_lemma_ranked = rank(lemma_features, bow_lemma_sum, "bow_sum")

# Show the top of each ranking.
bow_top_n = 15
print("BOW Pure:\n", bow_pure_ranked.head(bow_top_n), "\n")
print("BOW Stem:\n", bow_stem_ranked.head(bow_top_n), "\n")
print("BOW Lemma:\n", bow_lemma_ranked.head(bow_top_n))


BOW Pure:
     term  bow_sum
1    the    24569
2    and    11938
3     of    11283
4     to    11090
5   that     7944
6     it     7628
7     in     7499
8     he     7003
9    was     6984
10   you     6716
11   his     5294
12    is     4746
13   had     4151
14  have     3924
15    my     3895 

BOW Stem:
       term  bow_sum
1     holm     2459
2     said     2109
3      man     1710
4       mr     1392
5      com      965
6     hand      895
7     room      891
8   watson      831
9     know      820
10    look      761
11   littl      749
12     tim      731
13     fac      721
14   think      690
15    hous      655 

BOW Lemma:
       term  bow_sum
1   holmes     2459
2     said     2109
3       mr     1581
4      man     1491
5     room      888
6   watson      831
7     hand      808
8     come      807
9     know      781
10  little      749
11    time      727
12    face      663
13   house      655
14   think      653
15    door      631

In [14]:
# Rank terms by their mean tf-idf weight for the stemmed and lemmatized variants.
tfidf_lemma_ranked = rank(lemma_features, tfidf_lemma_mean, "tfidf_mean")
tfidf_stem_ranked = rank(stem_features, tfidf_stem_mean, "tfidf_mean")

# Show the top of each ranking.
tfidf_top_n = 15
print("TFIDF Stem:\n", tfidf_stem_ranked.head(tfidf_top_n), "\n")
print("TFIDF Lemma:\n", tfidf_lemma_ranked.head(tfidf_top_n))


TFIDF Stem:
       term  tfidf_mean
1     holm    0.172742
2     said    0.149197
3      man    0.121634
4       mr    0.096009
5      com    0.068501
6     room    0.064444
7     hand    0.063361
8   watson    0.061516
9     know    0.059246
10    look    0.054095
11   littl    0.053367
12     tim    0.052151
13     fac    0.051594
14   think    0.049320
15    hous    0.048119 

TFIDF Lemma:
       term  tfidf_mean
1   holmes    0.166601
2     said    0.144094
3       mr    0.106475
4      man    0.103182
5     room    0.062111
6   watson    0.059392
7     come    0.055640
8     hand    0.055196
9     know    0.054384
10  little    0.051683
11    time    0.050081
12   house    0.046504
13    face    0.046060
14   think    0.045065
15    door    0.044791

In [15]:
from yellowbrick.text.freqdist import FreqDistVisualizer

# Term frequency distribution of the plain bag-of-words counts.
freq_pure = FreqDistVisualizer(features=pure_features)
freq_pure.fit(X_bow_pure)
freq_pure.poof()



In [16]:
# Term frequency distribution of the stemmed bag-of-words counts.
freq_stem = FreqDistVisualizer(features=stem_features)
freq_stem.fit(X_bow_stem)
freq_stem.poof()



In [17]:
# Term frequency distribution of the lemmatized bag-of-words counts.
freq_lemma = FreqDistVisualizer(features=lemma_features)
freq_lemma.fit(X_bow_lemma)
freq_lemma.poof()


Distribution


In [18]:
# Rank/frequency plot of the plain bag-of-words counts on log-log axes.
#
# Rename the score column on a copy instead of assigning to `.columns` on an alias:
#  * `bow_pure_ranked` keeps its original column names, so the cell can be re-run;
#  * no reliance on IPython's `_` (last cell output), whose value depends on which
#    cell happened to run before this one.
# The score column is addressed by position; its new name doubles as the legend label.
p_df = bow_pure_ranked.rename(columns={bow_pure_ranked.columns[1]: "all pure words in stories"})

ax = p_df.plot(loglog=True)
# The index of the ranked frame is the 1-based rank, the plotted column the summed count.
ax.set(xlabel="rank", ylabel="frequency")
ax.annotate("Zipfian Distribution", (100, 100), rotation=-33)
# Dashed anti-diagonal in axes coordinates as a visual reference line.
plt.plot([0, 1], [1, 0], transform=ax.transAxes, ls="--", c="k")


Out[18]:
[<matplotlib.lines.Line2D at 0x7fb033ff2f60>]

Stemming vs Lemma


In [19]:
# Rank/frequency curves of the lemmatized and stemmed counts on shared log-log axes.
#
# Rename the score column on copies instead of assigning to `.columns` on aliases:
#  * `bow_lemma_ranked` / `bow_stem_ranked` keep their original column names;
#  * no reliance on IPython's `_` (last cell output), whose value depends on which
#    cell happened to run before this one.
# The score column is addressed by position; its new name doubles as the legend label.
p_df1 = bow_lemma_ranked.rename(columns={bow_lemma_ranked.columns[1]: "lemma"})
p_df2 = bow_stem_ranked.rename(columns={bow_stem_ranked.columns[1]: "stem"})

ax = p_df1.plot(loglog=True)
p_df2.plot(loglog=True, ax=ax)
# The index of the ranked frames is the 1-based rank, the plotted column the summed count.
ax.set(xlabel="rank", ylabel="frequency")
# Dashed anti-diagonal in axes coordinates as a visual reference line.
plt.plot([0, 1], [1, 0], transform=ax.transAxes, ls="--", c="k")


Out[19]:
[<matplotlib.lines.Line2D at 0x7fb033f4e710>]

t-SNE


In [20]:
# Collection titles; the position in this list is the integer target it stands for.
collection_titles = [
    "His_Last_Bow",
    "The_Adventures_of_Sherlock_Holmes",
    "The_Case-Book_of_Sherlock_Holmes",
    "The_Memoirs_of_Sherlock_Holmes",
    "The_Return_of_Sherlock_Holmes",
]
collections_map = dict(enumerate(collection_titles))

# One label per document; a target without a known title falls back to the raw target value.
labels = [collections_map.get(target, target) for target in raw_corpus.target]

Stemmed Tf-idf Vectors


In [21]:
from yellowbrick.text import TSNEVisualizer

# 2-D t-SNE projection of the stemmed tf-idf vectors, colored by collection label.
# t-SNE is stochastic: pin the seed so the figure is reproducible on re-run.
tsne = TSNEVisualizer(random_state=42)
tsne.fit(X_tfidf_stem, labels)
tsne.poof()


'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.
'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.
'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.
'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.
'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.

Lemmatized Tf-idf Vectors


In [22]:
# 2-D t-SNE projection of the lemmatized tf-idf vectors, colored by collection label.
# t-SNE is stochastic: pin the seed so the figure is reproducible on re-run.
tsne = TSNEVisualizer(random_state=42)
tsne.fit(X_tfidf_lemma, labels)
tsne.poof()


'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.
'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.
'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.
'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.
'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.

In [ ]: