In this notebook, we'll walk through some simple natural language processing techniques and work towards building a text classification model. Through this process we'll utilize the data science pipeline:
Ingestion → Wrangling → Analysis → Modeling → Visualization
The basic approach is to fetch HTML from web pages and extract just the text. We'll then apply tokenization and part-of-speech tagging to create a basic data structure. In preparation for modeling we'll normalize the text using lemmatization, then remove stopwords and punctuation. After that we'll vectorize the text and feed it to a classification model, which we'll evaluate with cross-validation.
For now, we'll simply ingest news articles from the Washington Post by looking up their ID from the short URL.
In [ ]:
%matplotlib inline
In [ ]:
import os
import requests
WAPO = "http://wpo.st/"
def fetch_wapo(sid="ciSa2"):
    url = WAPO + sid
    res = requests.get(url)
    return res.text
story = fetch_wapo()
In [ ]:
print(story)
The HTML that we fetched contains navigation, advertisements, and markup not related to the text. We need to clean it up to extract only the part of the document we're interested in analyzing.
Note that this is also the point at which we should consider larger document structures like chapters, sections, or paragraphs. If we want to consider paragraphs, the extract function should return a list of strings, one per paragraph (a sketch of such a variant follows the extraction code below).
In [ ]:
from bs4 import BeautifulSoup
from readability.readability import Document
def extract(html):
    article = Document(html).summary()
    soup = BeautifulSoup(article, 'lxml')
    return soup.get_text()
story = extract(story)
In [ ]:
print(story)
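As an aside, a paragraph-aware variant of extract might look like the sketch below. It assumes readability's summary keeps the article's <p> tags; it isn't used in the rest of this notebook.
In [ ]:
def extract_paragraphs(html):
    # Hedged sketch of a paragraph-aware extractor: readability pulls out the
    # article body, and we return one string per <p> tag instead of one blob.
    article = Document(html).summary()
    soup = BeautifulSoup(article, 'lxml')
    return [
        p.get_text()
        for p in soup.find_all('p')
        if p.get_text().strip()
    ]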
In [ ]:
import nltk
def tokenize(text):
    for sent in nltk.sent_tokenize(text):
        yield list(nltk.word_tokenize(sent))
story = list(tokenize(story))
In [ ]:
for sent in story: print(sent)
Tagging adds information to the data structure we have -- namely the word class of each token (e.g. noun, verb, adjective, etc.). Note that tagging needs a complete sentence to work effectively.
Tagging completes the non-destructive operations on our text string; this is the point at which the text should be saved to disk as a pickle for use in downstream processing (a minimal sketch of this save step follows the tagging code below).
In [ ]:
def tag(sents):
    for sent in sents:
        yield list(nltk.pos_tag(sent))
story = list(tag(story))
In [ ]:
for sent in story: print(sent)
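As noted above, this is a good point to persist the tagged text so the non-destructive preprocessing doesn't have to be repeated. A minimal sketch of that save step follows; the file path is illustrative and not part of the original workflow.
In [ ]:
import pickle

# Save the tagged sentences to disk (illustrative path); downstream steps
# can reload them with pickle.load() instead of re-running the steps above.
with open('data/tagged_story.pickle', 'wb') as f:
    pickle.dump(story, f)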
In [ ]:
from nltk.corpus import wordnet as wn
lemmatizer = nltk.WordNetLemmatizer()
def tagwn(tag):
    # Convert a Penn Treebank tag to a WordNet POS constant (default: noun).
    return {
        'N': wn.NOUN,
        'V': wn.VERB,
        'R': wn.ADV,
        'J': wn.ADJ
    }.get(tag[0], wn.NOUN)

def lemmatize(tagged_sents):
    for sent in tagged_sents:
        for token, tag in sent:
            yield lemmatizer.lemmatize(token, tagwn(tag))
story = list(lemmatize(story))
In [ ]:
print(story)
In [ ]:
from string import punctuation
from nltk.corpus import stopwords
punctuation = set(punctuation)
stopwords = set(stopwords.words('english'))
def normalize(tokens):
    # Lowercase each token, then drop punctuation-only tokens and stopwords.
    for token in tokens:
        token = token.lower()
        if not all(char in punctuation for char in token):
            if token not in stopwords:
                yield token
story = list(normalize(story))
In [ ]:
print(story)
Building models requires gathering multiple documents and performing the processing steps shown above on each of them. We've used a tool called Baleen to ingest data from RSS feeds for the past year (it currently contains 1,154,100 posts from 373 feeds after 5,566 jobs).
We've provided a small sample of the corpus to start playing with the tool. Documents are saved on disk in the following structure:
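To make that structure visible before we build a reader, here is a small, hedged sketch that walks the sample directory. It assumes the sample has been unpacked to data/baleen_sample, the same path used below.
In [ ]:
# One directory per category, each containing pickled documents.
sample_root = "data/baleen_sample"

for category in sorted(os.listdir(sample_root)):
    path = os.path.join(sample_root, category)
    if os.path.isdir(path):
        pickles = [f for f in os.listdir(path) if f.endswith('.pickle')]
        print("{}/ - {} pickled documents".format(category, len(pickles)))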
We can then create a corpus reader to fetch data from our corpus automatically. This is a bit more complex, but necessary. Note that we also fold our normalization process into the reader, so we don't have to repeat those steps later on.
In [ ]:
import string
import pickle
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.api import CategorizedCorpusReader
CORPUS_PATH = "data/baleen_sample"
PKL_PATTERN = r'(?!\.)[a-z_\s]+/[a-f0-9]+\.pickle'
CAT_PATTERN = r'([a-z_\s]+)/.*'
class PickledCorpus(CategorizedCorpusReader, CorpusReader):

    def __init__(self, root, fileids=PKL_PATTERN, cat_pattern=CAT_PATTERN):
        CategorizedCorpusReader.__init__(self, {"cat_pattern": cat_pattern})
        CorpusReader.__init__(self, root, fileids)

        self.punct = set(string.punctuation) | {'“', '—', '’', '”', '…'}
        self.stopwords = set(nltk.corpus.stopwords.words('english'))
        self.wordnet = nltk.WordNetLemmatizer()

    def _resolve(self, fileids, categories):
        if fileids is not None and categories is not None:
            raise ValueError("Specify fileids or categories, not both")

        if categories is not None:
            return self.fileids(categories=categories)

        if fileids is None:
            return self.fileids()

        return fileids

    def lemmatize(self, token, tag):
        token = token.lower()

        if token not in self.stopwords:
            if not all(c in self.punct for c in token):
                tag = {
                    'N': wn.NOUN,
                    'V': wn.VERB,
                    'R': wn.ADV,
                    'J': wn.ADJ
                }.get(tag[0], wn.NOUN)
                return self.wordnet.lemmatize(token, tag)

    def tokenize(self, doc):
        # Expects a preprocessed document; removes stopwords and punctuation,
        # makes all tokens lowercase and lemmatizes them.
        return list(filter(None, [
            self.lemmatize(token, tag)
            for paragraph in doc
            for sentence in paragraph
            for token, tag in sentence
        ]))

    def docs(self, fileids=None, categories=None):
        # Resolve the fileids and the categories
        fileids = self._resolve(fileids, categories)

        # Create a generator, loading one document into memory at a time.
        for path, enc, fileid in self.abspaths(fileids, True, True):
            with open(path, 'rb') as f:
                yield self.tokenize(pickle.load(f))

    def labels(self, fileids=None, categories=None):
        fileids = self._resolve(fileids, categories)
        for fid in fileids:
            yield self.categories(fid)[0]
In [ ]:
corpus = PickledCorpus('data/baleen_sample')
In [ ]:
print("{} documents in {} categories".format(len(corpus.fileids()), len(corpus.categories())))
In [ ]:
from nltk import ConditionalFreqDist
words = ConditionalFreqDist()
for doc, label in zip(corpus.docs(), corpus.labels()):
    for word in doc:
        words[label][word] += 1
In [ ]:
for label, counts in words.items():
    print("{}: {:,} vocabulary and {:,} words".format(
        label, len(counts), sum(counts.values())
    ))
In [ ]:
from sklearn.manifold import TSNE
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
cluster = Pipeline([
    ('vect', CountVectorizer(tokenizer=lambda x: x, preprocessor=None, lowercase=False)),
    ('svd', TruncatedSVD(n_components=50)),
    ('tsne', TSNE(n_components=2))
])
docs = cluster.fit_transform(list(corpus.docs()))
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt
from collections import defaultdict
sns.set_style('whitegrid')
sns.set_context('notebook')
colors = {
    "design": "#e74c3c",
    "tech": "#3498db",
    "business": "#27ae60",
    "gaming": "#f1c40f",
    "politics": "#2c3e50",
    "news": "#bdc3c7",
    "cooking": "#d35400",
    "data_science": "#1abc9c",
    "sports": "#e67e22",
    "cinema": "#8e44ad",
    "books": "#c0392b",
    "do_it_yourself": "#34495e",
}
series = defaultdict(lambda: {'x': [], 'y': []})
for idx, label in enumerate(corpus.labels()):
    x, y = docs[idx]
    series[label]['x'].append(x)
    series[label]['y'].append(y)

fig = plt.figure(figsize=(12, 6))
ax = plt.subplot(111)

for label, points in series.items():
    ax.scatter(points['x'], points['y'], c=colors[label], alpha=0.7, label=label)

# Add a title
plt.title("TSNE Projection of the Baleen Corpus")

# Remove the ticks
plt.yticks([])
plt.xticks([])

# Add the legend: shrink the current axis by 20% and
# put the legend to the right of the current axis
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
We'll build a model that can classify what hobby a document is about based on our sample corpus.
We'll need to add transformers that vectorize our text and feed it to a classification model.
In this case we will evaluate with 12-fold cross-validation, using the cross_val_predict function and the classification_report function.
The function cross_val_predict has a similar interface to cross_val_score, but returns, for each element in the input, the prediction that was obtained for that element when it was in the test set. Only cross-validation strategies that assign all elements to a test set exactly once can be used (otherwise, an exception is raised).
In [ ]:
hobbies = ['gaming', 'cooking', 'sports', 'cinema', 'books', 'do_it_yourself']
X = list(corpus.docs(categories=hobbies))
y = list(corpus.labels(categories=hobbies))
In [ ]:
# Models
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
# Transformers
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
# Evaluation
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
def identity(words):
    return words
In [ ]:
# SVM Classifier
svm = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)),
    ('svm', SGDClassifier()),
])
yhat = cross_val_predict(svm, X, y, cv=12)
print(classification_report(y, yhat))
In [ ]:
# Logistic Regression
logit = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)),
    ('logit', LogisticRegression()),
])
yhat = cross_val_predict(logit, X, y, cv=12)
print(classification_report(y, yhat))
In [ ]:
# Naive Bayes
nbayes = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)),
    ('nbayes', MultinomialNB()),
])
yhat = cross_val_predict(nbayes, X, y, cv=12)
print(classification_report(y, yhat))
In [ ]:
# Random Forest
trees = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)),
    ('trees', RandomForestClassifier()),
])
yhat = cross_val_predict(trees, X, y, cv=12)
print(classification_report(y, yhat))
At this point we can save our best performing model to disk and use it to classify new text.
The most important thing to remember is that new input must be preprocessed exactly the same way as the data the model was trained on. Because we preprocessed our text during the experimental phase, we have to apply the same preprocessing before making predictions as well.
In [ ]:
def build_model(path, corpus):
    model = Pipeline([
        ('tfidf', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)),
        # loss='log' gives a logistic-loss SGD model that supports predict_proba.
        ('svm', SGDClassifier(loss='log')),
    ])

    # Train model on the entire data set
    X = list(corpus.docs(categories=hobbies))
    y = list(corpus.labels(categories=hobbies))
    model.fit(X, y)

    with open(path, 'wb') as f:
        pickle.dump(model, f)
build_model('data/hobbies.classifier', corpus)
In [ ]:
# We can now load our model from disk
with open('data/hobbies.classifier', 'rb') as f:
    model = pickle.load(f)
In [ ]:
# Let's create a normalization method for fetching URL content
# that our model expects, based on our methods above.
def fetch(url):
    html = requests.get(url)
    text = extract(html.text)
    tokens = tokenize(text)
    tags = tag(tokens)
    lemmas = lemmatize(tags)
    return list(normalize(lemmas))
In [ ]:
def predict(url):
    text = fetch(url)
    probs = zip(model.classes_, model.predict_proba([text])[0])
    label = model.predict([text])[0]

    print("y={}".format(label))
    for cls, prob in sorted(probs, key=lambda x: x[1]):
        print("  {}: {:0.3f}".format(cls, prob))
In [ ]:
predict("http://minimalistbaker.com/5-ingredient-white-chocolate-truffles/")