Get Data


In [20]:
from sklearn.datasets import fetch_20newsgroups
# get a cleaned corpus: strip headers, footers, and quoted replies
corpus = fetch_20newsgroups(subset='train', shuffle=True,
                            remove=('headers', 'footers', 'quotes'))

In [21]:
# explore the corpus structure
print(corpus.keys())
print(len(corpus.data))


['description', 'DESCR', 'filenames', 'target_names', 'data', 'target']
11314

In [22]:
# see which categories exist
print(corpus.target_names)


['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']

In [23]:
# see an example document
print(corpus.data[0])


I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Data Preprocessing


In [45]:
# Extract numerical feature vectors from the text documents
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
corpus_train_counts = count_vect.fit_transform(corpus.data)
corpus_train_counts.shape


Out[45]:
(11314, 101631)
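Each of the 101,631 columns corresponds to one token in the learned vocabulary. As a quick check (not part of the original run), the fitted vectorizer can be asked which column a given word maps to:

# look up the column index assigned to a word in the learned vocabulary
print(count_vect.vocabulary_.get('car'))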

In [46]:
# Count: so far we only count word occurrences, which gives longer documents more weight than shorter ones.
# TF (term frequency): to correct for length, divide each count by the total number of words in the document.
# TF-IDF: additionally down-weight words like "the", "is", "an" that occur in nearly every document;
# TF-IDF = term frequency times inverse document frequency.
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
corpus_train_tfidf = tfidf_transformer.fit_transform(corpus_train_counts)
corpus_train_tfidf.shape


Out[46]:
(11314, 101631)
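To make the weighting concrete, here is a minimal toy sketch, not part of the original notebook, using TfidfVectorizer (which combines counting and TF-IDF in one step) on two invented sentences:

from sklearn.feature_extraction.text import TfidfVectorizer
toy_corpus = ["the cat sat", "the dog ran"]   # made-up documents
toy_vect = TfidfVectorizer()
toy_tfidf = toy_vect.fit_transform(toy_corpus)
# 'the' occurs in both documents, so its idf (and final weight) is lower than 'cat' or 'sat'
# (use get_feature_names_out() instead on newer scikit-learn versions)
print(dict(zip(toy_vect.get_feature_names(), toy_tfidf.toarray()[0])))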

Train the Model


In [50]:
print(set(corpus.target))
print(len(corpus.target_names))


set([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])
20

In [79]:
# chain the preprocessing steps and the classifier in a single pipeline
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
pipeline = Pipeline([('vect', CountVectorizer()), 
                     ('tfidf', TfidfTransformer()), 
                     ('clf', MultinomialNB())])
pipeline_fit = pipeline.fit(corpus.data, corpus.target)
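As a quick sanity check (not part of the original run), the fitted pipeline can classify a couple of made-up sentences; the predicted class indices map back to category names through target_names:

# hypothetical example sentences, invented purely for illustration
docs_new = ['God is love', 'OpenGL on the GPU is fast']
for doc, category in zip(docs_new, pipeline_fit.predict(docs_new)):
    print('%r => %s' % (doc, corpus.target_names[category]))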

Predict on Test Data


In [82]:
# get the test split of the corpus, cleaned the same way as the training data
corpus_test = fetch_20newsgroups(subset='test', shuffle=True, remove=('headers', 'footers', 'quotes'))

# run pipeline 
predicted = pipeline_fit.predict(corpus_test.data)

Evaluate Performance


In [81]:
# check accuracy
import numpy as np
np.mean(predicted == corpus_test.target)


Out[81]:
0.60621348911311734
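The same number can also be computed with scikit-learn's own helper, which makes the intent a bit more explicit:

from sklearn import metrics
print(metrics.accuracy_score(corpus_test.target, predicted))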

Baselines


In [116]:
# The dummy classifier gives a measure of "baseline" performance, i.e. the
# success rate one should expect to achieve by simply guessing.
from sklearn.dummy import DummyClassifier
pipeline = Pipeline([('vect', CountVectorizer(stop_words='english')), 
                     ('tfidf', TfidfTransformer()), 
                     ('clf', DummyClassifier())])
pipeline_fit = pipeline.fit(corpus.data, corpus.target)

#predict
predicted = pipeline_fit.predict(corpus_test.data)

# check accuracy
np.mean(predicted == corpus_test.target)


Out[116]:
0.050318640467339353
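With 20 roughly balanced classes, guessing lands near 1/20 = 5%, which is what the run above shows. A slightly stronger baseline, sketched below and not part of the original run, always predicts the most frequent training class:

# hedged sketch: 'most_frequent' dummy baseline on the same features
dummy_pipeline = Pipeline([('vect', CountVectorizer(stop_words='english')),
                           ('tfidf', TfidfTransformer()),
                           ('clf', DummyClassifier(strategy='most_frequent'))])
dummy_pipeline.fit(corpus.data, corpus.target)
print(np.mean(dummy_pipeline.predict(corpus_test.data) == corpus_test.target))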

Improvements

1. Removing stop words


In [85]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([('vect', CountVectorizer(stop_words='english')), 
                     ('tfidf', TfidfTransformer()), 
                     ('clf', MultinomialNB())])
pipeline_fit = pipeline.fit(corpus.data, corpus.target)

# run pipeline 
predicted = pipeline_fit.predict(corpus_test.data)

# check accuracy
np.mean(predicted == corpus_test.target)


Out[85]:
0.67790759426447156

In [86]:
# Simply removing stop words improves test accuracy from about 0.61 to about 0.68.

2. Stemming


In [93]:
# create a stemmed version of CountVectorizer
import nltk
# nltk.download()  # uncomment to fetch the NLTK data if it is not installed yet
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)
class StemmedCountVectorizer(CountVectorizer):
    # override build_analyzer so each token produced by the standard analyzer
    # (tokenization, lowercasing, stop word removal) is stemmed afterwards
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: [stemmer.stem(w) for w in analyzer(doc)]
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

# fit the pipeline on the training data
pipeline = Pipeline([('vect', stemmed_count_vect), 
                     ('tfidf', TfidfTransformer()), 
                     ('mnb', MultinomialNB(fit_prior=False))])
pipeline_fit = pipeline.fit(corpus.data, corpus.target)

# predict
predicted = pipeline_fit.predict(corpus_test.data)

# evaluate
np.mean(predicted == corpus_test.target)


Out[93]:
0.67843866171003719
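To see what the overridden analyzer actually does (illustration only, not in the original run), it can be applied directly to a made-up sentence:

# the analyzer lowercases, tokenizes, drops stop words, then stems each remaining token
analyzer = stemmed_count_vect.build_analyzer()
print(analyzer("The runners were running quickly"))
# expected to print something like: ['runner', 'run', 'quick']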

3. Try other models


In [112]:
from sklearn.neighbors import KNeighborsClassifier
pipeline = Pipeline([('vect', CountVectorizer()), 
                     ('tfidf', TfidfTransformer()),
                     ('clf-knn', KNeighborsClassifier(n_neighbors=500))])

pipeline_fit = pipeline.fit(corpus.data, corpus.target)

#predict
predicted = pipeline_fit.predict(corpus_test.data)

# evaluate
np.mean(predicted == corpus_test.target)


Out[112]:
0.40560276155071695

4. More advanced models: SVM


In [114]:
# A linear support vector machine (SVM) is widely regarded as one of the strongest algorithms for text classification;
# SGDClassifier with hinge loss trains a linear SVM by stochastic gradient descent.
from sklearn.linear_model import SGDClassifier
pipeline = Pipeline([('vect', CountVectorizer()), 
                     ('tfidf', TfidfTransformer()),
                     ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                                               n_iter=5,  # called max_iter in newer scikit-learn versions
                                               random_state=42))])

pipeline_fit = pipeline.fit(corpus.data, corpus.target)

#predict
predicted = pipeline_fit.predict(corpus_test.data)

# evaluate
np.mean(predicted == corpus_test.target)


Out[114]:
0.68361656930430159

In [92]:
# Hyperparameter tuning is worth doing whenever the chosen algorithm has tunable parameters; see the sketch below.
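A hedged sketch of what such tuning could look like with GridSearchCV, using a small illustrative parameter grid (the grid values below are assumptions, not tuned results); note this re-fits the SVM pipeline several times, so it is noticeably slower than a single fit:

# GridSearchCV lives in sklearn.grid_search in very old scikit-learn versions
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],   # unigrams vs. unigrams + bigrams
              'tfidf__use_idf': (True, False),
              'clf-svm__alpha': (1e-2, 1e-3)}
gs_clf = GridSearchCV(pipeline, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(corpus.data, corpus.target)
print(gs_clf.best_score_)
print(gs_clf.best_params_)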

More Evaluation Metrics


In [115]:
from sklearn.metrics import cohen_kappa_score
cohen_kappa_score(corpus_test.target, predicted)


Out[115]:
0.66652234784910114

In [96]:
from sklearn import metrics
print(metrics.classification_report(
    corpus_test.target,
    predicted,
    target_names=corpus_test.target_names))


                          precision    recall  f1-score   support

             alt.atheism       0.56      0.42      0.48       319
           comp.graphics       0.69      0.67      0.68       389
 comp.os.ms-windows.misc       0.67      0.60      0.63       394
comp.sys.ibm.pc.hardware       0.65      0.65      0.65       392
   comp.sys.mac.hardware       0.76      0.68      0.72       385
          comp.windows.x       0.74      0.71      0.73       395
            misc.forsale       0.48      0.85      0.61       390
               rec.autos       0.79      0.70      0.74       396
         rec.motorcycles       0.73      0.77      0.75       398
      rec.sport.baseball       0.82      0.78      0.80       397
        rec.sport.hockey       0.82      0.91      0.86       399
               sci.crypt       0.71      0.74      0.73       396
         sci.electronics       0.67      0.49      0.57       393
                 sci.med       0.76      0.79      0.78       396
               sci.space       0.70      0.76      0.73       394
  soc.religion.christian       0.61      0.82      0.70       398
      talk.politics.guns       0.56      0.70      0.62       364
   talk.politics.mideast       0.74      0.82      0.78       376
      talk.politics.misc       0.70      0.35      0.47       310
      talk.religion.misc       0.52      0.12      0.20       251

             avg / total       0.69      0.68      0.67      7532


In [98]:
metrics.confusion_matrix(corpus_test.target, predicted)


Out[98]:
array([[135,   0,   2,   2,   0,   2,  13,   4,   9,   5,   4,   4,   4,
          8,  20,  73,   6,  20,   0,   8],
       [  7, 262,  20,  12,   7,  22,  12,   1,   5,   3,   0,  14,   5,
          2,  13,   1,   1,   2,   0,   0],
       [  2,  15, 238,  34,  18,  23,  18,   1,   3,   4,   2,   4,   1,
          8,  11,   1,   4,   1,   4,   2],
       [  0,  14,  26, 256,  21,   9,  25,   2,   1,   1,   2,   9,  21,
          1,   1,   0,   0,   2,   1,   0],
       [  0,   6,   6,  35, 261,   7,  30,   6,   9,   0,   3,   5,   9,
          2,   4,   1,   1,   0,   0,   0],
       [  0,  37,  33,   4,   6, 282,  14,   0,   2,   1,   0,   6,   3,
          1,   5,   0,   0,   1,   0,   0],
       [  0,   3,   0,  12,   9,   0, 330,   7,   6,   2,   2,   1,   5,
          0,   6,   1,   5,   1,   0,   0],
       [  6,   1,   4,   1,   1,   3,  37, 276,  21,   3,   3,   2,  16,
          2,   5,   0,   8,   5,   2,   0],
       [  2,   0,   1,   1,   1,   0,  24,  16, 307,   4,   0,   1,   5,
          7,   8,   4,   6,   5,   4,   2],
       [  3,   1,   0,   0,   1,   1,  22,   2,   5, 311,  36,   1,   1,
          2,   3,   4,   0,   0,   4,   0],
       [  2,   2,   0,   1,   0,   0,   9,   0,   2,   7, 364,   1,   0,
          2,   2,   1,   4,   0,   0,   2],
       [  3,   5,   5,   3,   4,   6,  20,   1,   3,   6,   1, 294,   5,
          4,   7,   2,  15,   6,   4,   2],
       [  1,  11,   8,  22,   8,  11,  32,  11,  14,   7,   5,  37, 194,
         14,  12,   2,   2,   2,   0,   0],
       [  1,   7,   1,   0,   0,   2,  23,   3,   5,   1,   7,   2,   7,
        312,   3,   7,   5,   5,   4,   1],
       [  4,   7,   3,   1,   1,   2,  22,   5,   4,   3,   2,   2,   9,
          9, 301,   3,   4,   7,   5,   0],
       [ 14,   3,   2,   1,   0,   2,  15,   0,   1,   4,   2,   1,   2,
          6,   6, 325,   2,   6,   2,   4],
       [  3,   2,   4,   3,   2,   2,  16,   3,   7,   4,   1,  16,   0,
          6,   6,  12, 253,   9,   9,   6],
       [ 13,   2,   2,   0,   0,   2,   8,   1,   5,   6,   1,   3,   1,
          2,   1,   7,  10, 308,   4,   0],
       [ 11,   0,   0,   2,   1,   2,   7,   5,   6,   4,   7,   9,   1,
          8,  13,   5,  98,  20, 109,   2],
       [ 32,   4,   2,   1,   1,   1,  11,   6,   5,   1,   4,   2,   1,
         13,   4,  85,  28,  15,   4,  31]])
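Raw counts are hard to compare across classes of different sizes; a small follow-up sketch (not part of the original run) normalizes each row so the diagonal reads as per-class recall. The large off-diagonal entry of 85 in the last row, for example, shows talk.religion.misc being confused with soc.religion.christian.

# row-normalize the confusion matrix: each row sums to 1, the diagonal is per-class recall
cm = metrics.confusion_matrix(corpus_test.target, predicted)
cm_normalized = cm.astype(float) / cm.sum(axis=1, keepdims=True)
print(np.round(cm_normalized.diagonal(), 2))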

In [ ]: