In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
from sklearn.datasets import load_files
raw_corpus = load_files("../data/", shuffle=False)
In [3]:
doc_count = len(raw_corpus.data)
print("Doc count:", doc_count)
assert doc_count == 56, "Wrong number of documents loaded; expected the 56 short stories"
In [4]:
print("Beginning:\n", raw_corpus.data[0][:500], "\n")
print("End:\n", raw_corpus.data[0][-200:])
In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(strip_accents="ascii", lowercase=True)
X = tfidf.fit_transform(raw_corpus.data)
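With scikit-learn's default settings (smooth_idf=True, norm="l2"), each matrix entry is the raw term count tf(t, d) scaled by

idf(t) = ln((1 + n) / (1 + df(t))) + 1

where n is the number of documents and df(t) the number of documents containing term t; each document row is then L2-normalized.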
In [6]:
print(len(tfidf.get_feature_names()))  # vocabulary size
X.shape
In [7]:
# tfidf.get_feature_names()  # uncomment to list the full vocabulary
In [8]:
# Custom wrangler: project-local tokenizer, passed as the tokenizer= callable below
from helpers.tokenizer import TextWrangler
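helpers.tokenizer.TextWrangler is a project-local class whose source is not shown here. A minimal sketch of what such a callable tokenizer might look like, assuming NLTK's SnowballStemmer and WordNetLemmatizer (the class below is a hypothetical illustration, not the actual implementation):

import re
from nltk.stem import SnowballStemmer, WordNetLemmatizer

class TextWrangler:
    # Hypothetical sketch -- the real helpers.tokenizer.TextWrangler may differ.
    def __init__(self, kind="stem"):
        self.kind = kind
        self.stemmer = SnowballStemmer("english")
        self.lemmatizer = WordNetLemmatizer()

    def __call__(self, doc):
        tokens = re.findall(r"[a-z]+", doc.lower())
        if self.kind == "stem":
            return [self.stemmer.stem(t) for t in tokens]
        return [self.lemmatizer.lemmatize(t) for t in tokens]

CountVectorizer and TfidfVectorizer call the tokenizer once per (decoded) document, so it must accept a string and return a list of tokens.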
In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Baseline: raw token counts with no extra normalization
bow_pure = CountVectorizer(strip_accents="ascii", lowercase=True, stop_words=None)
X_bow_pure = bow_pure.fit_transform(raw_corpus.data)
# Stemmed variants (counts and tf-idf)
bow_stem = CountVectorizer(strip_accents="ascii", tokenizer=TextWrangler(kind="stem"))
X_bow_stem = bow_stem.fit_transform(raw_corpus.data)
tfidf_stem = TfidfVectorizer(strip_accents="ascii", tokenizer=TextWrangler(kind="stem"))
X_tfidf_stem = tfidf_stem.fit_transform(raw_corpus.data)
# Lemmatized variants (counts and tf-idf)
bow_lemma = CountVectorizer(strip_accents="ascii", tokenizer=TextWrangler(kind="lemma"))
X_bow_lemma = bow_lemma.fit_transform(raw_corpus.data)
tfidf_lemma = TfidfVectorizer(strip_accents="ascii", tokenizer=TextWrangler(kind="lemma"))
X_tfidf_lemma = tfidf_lemma.fit_transform(raw_corpus.data)
In [10]:
print("Pure bow vector:", X_bow_pure.shape)
print(bow_pure.get_feature_names()[:10], "...")
print("Stemmed bow vector:", X_bow_stem.shape)
print(bow_stem.get_feature_names()[:10], "...")
print("Lemmatized bow vector:", X_bow_lemma.shape)
print(bow_lemma.get_feature_names()[:10], "...")
print("Stemmed tfidf vector:", X_tfidf_stem.shape)
print(tfidf_stem.get_feature_names()[:10], "...")
print("Lemmatized tfidf vector:", X_tfidf_lemma.shape)
print(tfidf_lemma.get_feature_names()[:10], "...")
In [11]:
# Ranking word patterns
def rank(features, values, metric):
    """Pair each term with its score and return a DataFrame ranked in descending order."""
    data = []
    for col, term in enumerate(features):
        data.append((term, values[0, col]))
    df = pd.DataFrame(data, columns=["term", metric])
    order = df.sort_values(metric, ascending=False)
    order.reset_index(drop=True, inplace=True)
    order.index = order.index + 1  # 1-based index doubles as the rank
    return order
In [12]:
assert bow_stem.get_feature_names() == tfidf_stem.get_feature_names(), "Mismatch of stem feature names (->tokenize)"
assert bow_lemma.get_feature_names() == tfidf_lemma.get_feature_names(), "Mismatch of lemma feature names (->tokenize)"
stem_features = bow_stem.get_feature_names()
lemma_features = bow_lemma.get_feature_names()
pure_features = bow_pure.get_feature_names()
# Column sums give corpus-wide term counts; column means give average tf-idf weight
bow_stem_sum = X_bow_stem.sum(axis=0)
bow_lemma_sum = X_bow_lemma.sum(axis=0)
bow_pure_sum = X_bow_pure.sum(axis=0)
tfidf_stem_mean = X_tfidf_stem.mean(axis=0)
tfidf_lemma_mean = X_tfidf_lemma.mean(axis=0)
In [13]:
bow_stem_ranked = rank(stem_features, bow_stem_sum, "bow_sum")
bow_lemma_ranked = rank(lemma_features, bow_lemma_sum, "bow_sum")
bow_pure_ranked = rank(pure_features, bow_pure_sum, "bow_sum")
print("BOW Pure:\n", bow_pure_ranked.head(15), "\n")
print("BOW Stem:\n", bow_stem_ranked.head(15), "\n")
print("BOW Lemma:\n", bow_lemma_ranked.head(15))
In [14]:
tfidf_stem_ranked = rank(stem_features, tfidf_stem_mean, "tfidf_mean")
tfidf_lemma_ranked = rank(lemma_features, tfidf_lemma_mean, "tfidf_mean")
print("TFIDF Stem:\n", tfidf_stem_ranked.head(15), "\n")
print("TFIDF Lemma:\n", tfidf_lemma_ranked.head(15))
In [15]:
from yellowbrick.text.freqdist import FreqDistVisualizer
visualizer = FreqDistVisualizer(features=pure_features)
visualizer.fit(X_bow_pure)
visualizer.poof()
In [16]:
visualizer = FreqDistVisualizer(features=stem_features)
visualizer.fit(X_bow_stem)
visualizer.poof()
In [17]:
visualizer = FreqDistVisualizer(features=lemma_features)
visualizer.fit(X_bow_lemma)
visualizer.poof()
In [18]:
p_df = bow_pure_ranked.rename(columns={"bow_sum": "all pure words in stories"})
ax = p_df.plot(loglog=True)
ax.annotate("Zipfian Distribution", (100, 100), rotation=-33)
# Dashed diagonal as reference: a perfect Zipf law is a straight line on log-log axes
plt.plot([0, 1], [1, 0], transform=ax.transAxes, ls="--", c="k")
Stemming vs. Lemmatization
In [19]:
p_df1 = bow_lemma_ranked.rename(columns={"bow_sum": "lemma"})
p_df2 = bow_stem_ranked.rename(columns={"bow_sum": "stem"})
ax = p_df1.plot(loglog=True)
p_df2.plot(loglog=True, ax=ax)
plt.plot([0, 1], [1, 0], transform=ax.transAxes, ls="--", c="k")
In [20]:
# Map numeric targets (assigned from the folder names) back to collection titles
collections_map = {0: "His_Last_Bow", 1: "The_Adventures_of_Sherlock_Holmes",
                   2: "The_Case-Book_of_Sherlock_Holmes", 3: "The_Memoirs_of_Sherlock_Holmes",
                   4: "The_Return_of_Sherlock_Holmes"}
labels = [collections_map.get(item, item) for item in raw_corpus.target]
Stemmed Tf-idf Vectors
In [21]:
from yellowbrick.text import TSNEVisualizer
tsne = TSNEVisualizer()
tsne.fit(X_tfidf_stem, labels)
tsne.poof()
Lemmatized Tf-idf Vectors
In [22]:
tsne = TSNEVisualizer()
tsne.fit(X_tfidf_lemma, labels)
tsne.poof()
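Yellowbrick's TSNEVisualizer first decomposes the sparse tf-idf matrix (by default with TruncatedSVD down to 50 components) before running t-SNE on the result. A roughly equivalent manual pipeline, sketched here under that assumption:

from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE

# Reduce the sparse tf-idf matrix first; t-SNE scales poorly with raw vocabulary size
reduced = TruncatedSVD(n_components=50).fit_transform(X_tfidf_lemma)
points = TSNE(n_components=2).fit_transform(reduced)
plt.scatter(points[:, 0], points[:, 1], c=raw_corpus.target, cmap="tab10")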