In this notebook, we'll experiment with HashingVectorizer by building a classifier that predicts whether a chunk of text comes from the English Wikipedia article "Anarchism" or the article "Anachronism".
In [1]:
import random
import mwapi
import mwparserfromhell as mwparser
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import HashingVectorizer
In [2]:
session = mwapi.Session("https://en.wikipedia.org",
                        user_agent="Hashing vectorizer example <aaron.halfaker@gmail.com>")
doc = session.get(action="query", prop="revisions", titles=["Anarchism", "Anachronism"], rvprop=['content'])
anarchism_text = doc['query']['pages']['12']['revisions'][0]['*']
anachronism_text = doc['query']['pages']['60731']['revisions'][0]['*']
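A quick sanity check that the API call returned what we expect is to look at the length of each wikitext blob (a small addition, not in the original notebook).
In [ ]:
# Rough sanity check (added): the raw wikitext of each article should be
# thousands of characters long if the fetch worked.
len(anarchism_text), len(anachronism_text)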
In [3]:
observations = []
for text_chunk in mwparser.parse(anarchism_text).filter_text():
    text = text_chunk.value
    if len(text) > 25:
        observations.append((text, "anarchism"))

for text_chunk in mwparser.parse(anachronism_text).filter_text():
    text = text_chunk.value
    if len(text) > 25:
        observations.append((text, "anachronism"))

print("anarchism text chunks:", sum(1 for _, label in observations if label == "anarchism"))
print("anachronism text chunks:", sum(1 for _, label in observations if label == "anachronism"))
In [4]:
random.shuffle(observations)
train_set = observations[:int(len(observations)*0.8)]
test_set = observations[int(len(observations)*0.8):]
len(train_set), len(test_set)
Out[4]:
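A plain shuffle-and-slice split can leave the much smaller "anachronism" class under-represented in the test set. An alternative (not what this notebook does) is scikit-learn's train_test_split with stratify, which preserves the class ratio in both splits; a minimal sketch:
In [ ]:
# Hedged alternative (not used below): a stratified split keeps the
# anarchism/anachronism ratio the same in the train and test sets.
from sklearn.model_selection import train_test_split
all_texts = [text for text, _ in observations]
all_labels = [label for _, label in observations]
train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts, all_labels, test_size=0.2, stratify=all_labels, random_state=0)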
In [13]:
hv = HashingVectorizer(n_features=2**16)
gbc = GradientBoostingClassifier()
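HashingVectorizer applies the hashing trick: each token is hashed straight to one of n_features (here 2**16) column indices, so no vocabulary has to be built or stored. A quick look at what the transform produces, using toy sentences rather than the article text:
In [ ]:
# Toy demonstration (added): whatever text goes in, the output is a sparse
# matrix with a fixed width of 2**16 columns, one row per document.
toy_X = hv.transform(["anarchism rejects hierarchy",
                      "an anachronism is out of its own time"])
toy_X.shape, toy_X.nnz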
In [14]:
# Training
texts, labels_y = zip(*train_set)
features_X = hv.transform(texts)
gbc.fit(features_X, labels_y,
        sample_weight=[119/(119+1433) if l == "anarchism" else 1433/(119+1433)
                       for l in labels_y])
Out[14]:
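The hard-coded 119 and 1433 are the chunk counts printed earlier; each observation is weighted by the other class's share of the data, so the minority "anachronism" chunks aren't drowned out. The same inverse-frequency weights can be derived from whatever counts the training labels actually have (a small generalization, not part of the original notebook):
In [ ]:
# Hedged alternative (added): compute the inverse-frequency weights from the
# training labels instead of hard-coding 119 and 1433.  For two classes this
# reproduces the scheme above: each label's weight is the other class's share.
from collections import Counter
counts = Counter(labels_y)
total = sum(counts.values())
weights = [1 - counts[l] / total for l in labels_y]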
In [15]:
# Testing
texts, labels_y = zip(*test_set)
features_X = hv.transform(texts)
gbc.score(features_X.todense(), labels_y)
Out[15]:
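Accuracy alone can look flattering when one class dominates the data. A per-class breakdown with sklearn.metrics.classification_report (a small addition, not in the original notebook) is more informative:
In [ ]:
# Added: per-class precision and recall on the test set, which is more
# informative than raw accuracy given the class imbalance.
from sklearn.metrics import classification_report
predictions = gbc.predict(features_X.todense())
print(classification_report(labels_y, predictions))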
Statistics are great, but let's look at some example predictions. This loop generates predictions for the first ten test observations and prints them alongside the actual labels.
In [16]:
for text, label in test_set[:10]:
    features_X = hv.transform([text])
    print("text:", repr(text[:50] + "..."), "\n",
          "\tactual:", label, "\n",
          "\tprediction:", dict(zip(gbc.classes_,
                                    [int(v*100) for v in gbc.predict_proba(features_X.todense())[0]])))
In [23]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.hist(gbc.feature_importances_, bins=100, log=True)
plt.title("Feature importance histogram")
plt.xlabel("Feature importance")
plt.ylabel("Number of features")
Out[23]:
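Nearly all of the 2**16 hash buckets end up with zero importance; a handful carry the signal. Hashing is one-way, so a bucket can't be mapped directly back to a token, but you can hash candidate tokens and see whether they land in the important buckets. A rough sketch (assumes numpy; the candidate tokens are just guesses, not from the notebook):
In [ ]:
# Hedged sketch (added): list the highest-importance buckets, then hash a few
# guessed tokens to see which buckets they fall into.  This only confirms
# suspected tokens; it cannot recover the vocabulary from the hashes.
import numpy as np
top_buckets = np.argsort(gbc.feature_importances_)[::-1][:10]
print("top buckets:", top_buckets)
for token in ["anarchist", "anachronistic", "the"]:
    print(token, "->", hv.transform([token]).nonzero()[1])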