In [20]:
from sklearn.datasets import fetch_20newsgroups
# load a clean training corpus, removing headers, footers, and quotes
corpus = fetch_20newsgroups(subset='train', shuffle=True,
                            remove=('headers', 'footers', 'quotes'))
In [21]:
# explore the corpus structure
print(corpus.keys())
print(len(corpus.data))
In [22]:
# see which categories exist
print(corpus.target_names)
In [23]:
# see an example document
print(corpus.data[0])
In [45]:
# Extract numerical feature vectors from the text documents
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
corpus_train_counts = count_vect.fit_transform(corpus.data)
corpus_train_counts.shape
Out[45]:
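In [ ]:
# Added sketch: what CountVectorizer does on a tiny made-up example -
# it builds a vocabulary and returns a sparse document-term count matrix.
# The toy sentences below are purely illustrative.
toy_docs = ["the cat sat on the mat", "the dog sat"]
toy_vect = CountVectorizer()
toy_counts = toy_vect.fit_transform(toy_docs)
print(sorted(toy_vect.vocabulary_.items()))  # (term, column index) pairs
print(toy_counts.toarray())                  # one row of counts per document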
In [46]:
# Count: so far we only count word occurrences - issue: this gives more weight to longer documents than to shorter ones.
# TF (Term Frequency): to avoid this, we can normalise counts by document length, i.e. count(word) / total words in each document.
# TF-IDF: we can further reduce the weight of very common words (the, is, an, etc.) that occur in almost all documents.
# This is TF-IDF, i.e. Term Frequency times Inverse Document Frequency.
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
corpus_train_tfidf = tfidf_transformer.fit_transform(corpus_train_counts)
corpus_train_tfidf.shape
Out[46]:
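In [ ]:
# Added side note (sketch): the counting and TF-IDF steps above can be collapsed
# into a single TfidfVectorizer, which is equivalent to CountVectorizer followed
# by TfidfTransformer with default settings.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer()
corpus_train_tfidf_direct = tfidf_vect.fit_transform(corpus.data)
print(corpus_train_tfidf_direct.shape)  # same shape as the two-step result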
In [50]:
print(set(corpus.target))
print(len(corpus.target_names))
In [79]:
# perform the preprocessing steps and the training in a single pipeline
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
pipeline = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])
pipeline_fit = pipeline.fit(corpus.data, corpus.target)
In [82]:
# get the test data
corpus_test = fetch_20newsgroups(subset='test', shuffle=True, remove=('headers', 'footers', 'quotes'))
# run pipeline
predicted = pipeline_fit.predict(corpus_test.data)
In [81]:
# check accuracy
import numpy as np
np.mean(predicted == corpus_test.target)
Out[81]:
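In [ ]:
# Added note: the same accuracy can also be computed with sklearn's helper.
from sklearn import metrics
print(metrics.accuracy_score(corpus_test.target, predicted))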
In [116]:
# The dummy classifier gives a measure of "baseline" performance, i.e. the
# success rate one should expect to achieve by simply guessing.
from sklearn.dummy import DummyClassifier
pipeline = Pipeline([('vect', CountVectorizer(stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', DummyClassifier())])
pipeline_fit = pipeline.fit(corpus.data, corpus.target)
#predict
predicted = pipeline_fit.predict(corpus_test.data)
# check accuracy
np.mean(predicted == corpus_test.target)
Out[116]:
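In [ ]:
# Added sketch: DummyClassifier supports several guessing strategies; comparing
# a couple of them makes the baseline explicit. 'most_frequent' always predicts
# the majority class, 'stratified' guesses at random according to the training
# class distribution. This is illustrative, not part of the original analysis.
for strategy in ['most_frequent', 'stratified']:
    baseline = Pipeline([('vect', CountVectorizer(stop_words='english')),
                         ('tfidf', TfidfTransformer()),
                         ('clf', DummyClassifier(strategy=strategy))])
    baseline.fit(corpus.data, corpus.target)
    print(strategy, np.mean(baseline.predict(corpus_test.data) == corpus_test.target))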
In [85]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([('vect', CountVectorizer(stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])
pipeline_fit = pipeline.fit(corpus.data, corpus.target)
# run pipeline
predicted = pipeline_fit.predict(corpus_test.data)
# check accuracy
np.mean(predicted == corpus_test.target)
Out[85]:
In [86]:
# It looks like removing stop words alone gave a small performance improvement.
In [93]:
# create a stemmed version of CountVectorizer
import nltk
# nltk.download('stopwords')  # run once; ignore_stopwords=True uses the NLTK stopwords corpus
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        # stem each token produced by the standard analyzer
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: [stemmer.stem(w) for w in analyzer(doc)]

stemmed_count_vect = StemmedCountVectorizer(stop_words='english')
# fit the pipeline on the training data
pipeline = Pipeline([('vect', stemmed_count_vect),
                     ('tfidf', TfidfTransformer()),
                     ('mnb', MultinomialNB(fit_prior=False))])
pipeline_fit = pipeline.fit(corpus.data, corpus.target)
# predict
predicted = pipeline_fit.predict(corpus_test.data)
# evaluate
np.mean(predicted == corpus_test.target)
Out[93]:
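In [ ]:
# Added sketch: a quick check of what the stemmed analyzer actually produces.
# The sample sentence is made up purely for illustration.
analyze = stemmed_count_vect.build_analyzer()
print(analyze("The runners were running quickly through the fields"))
# expect stemmed, stop-word-free tokens such as 'runner', 'run', 'quick', 'field'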
In [112]:
from sklearn.neighbors import KNeighborsClassifier
pipeline = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf-knn', KNeighborsClassifier(n_neighbors=500))])
pipeline_fit = pipeline.fit(corpus.data, corpus.target)
#predict
predicted = pipeline_fit.predict(corpus_test.data)
# evaluate
np.mean(predicted == corpus_test.target)
Out[112]:
In [114]:
# A linear support vector machine (SVM) is widely regarded as one of the best text classification algorithms.
# SGDClassifier with hinge loss trains a linear SVM using stochastic gradient descent.
from sklearn.linear_model import SGDClassifier
pipeline = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',
                                               alpha=1e-3, max_iter=5, random_state=42))])
pipeline_fit = pipeline.fit(corpus.data, corpus.target)
#predict
predicted = pipeline_fit.predict(corpus_test.data)
# evaluate
np.mean(predicted == corpus_test.target)
Out[114]:
In [92]:
# Perform hyperparameter tuning when using algorithms with tunable parameters, e.g. with GridSearchCV (see the sketch below).
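In [ ]:
# Added sketch: one possible grid search over the SVM pipeline fitted above.
# The parameter grid is illustrative, not tuned for this corpus, and the step
# names ('vect', 'tfidf', 'clf-svm') must match the pipeline definition.
from sklearn.model_selection import GridSearchCV
param_grid = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': [True, False],
              'clf-svm__alpha': [1e-2, 1e-3]}
grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1)
grid_search = grid_search.fit(corpus.data, corpus.target)
print(grid_search.best_score_)
print(grid_search.best_params_)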
In [115]:
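# Cohen's kappa measures agreement between predicted and true labels,
# corrected for the agreement expected by chance (1.0 = perfect, 0 = chance level).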
from sklearn.metrics import cohen_kappa_score
cohen_kappa_score(corpus_test.target, predicted)
Out[115]:
In [96]:
from sklearn import metrics
print(metrics.classification_report(
corpus_test.target,
predicted,
target_names=corpus_test.target_names))
In [98]:
metrics.confusion_matrix(corpus_test.target, predicted)
Out[98]: