Get Data


In [20]:
from sklearn.datasets import fetch_20newsgroups
# get a cleaned corpus: strip headers, footers, and quoted replies
corpus = fetch_20newsgroups(subset='train', shuffle=True,
                            remove=('headers', 'footers', 'quotes'))

In [21]:
# explore the corpus structure
print(corpus.keys())
print(len(corpus.data))


['description', 'DESCR', 'filenames', 'target_names', 'data', 'target']
11314

In [22]:
# see which categories exist
print(corpus.target_names)


['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']

In [23]:
# see an example document
print(corpus.data[0])


I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Data Preprocessing


In [45]:
# Extract numerical feature vectors from the text documents
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
corpus_train_counts = count_vect.fit_transform(corpus.data)
corpus_train_counts.shape


Out[45]:
(11314, 101631)
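Each of the 101,631 columns corresponds to one token in the learned vocabulary. As a quick check (not part of the original run), the fitted vectorizer can be asked which column a given word maps to:

# look up the column index assigned to a word in the learned vocabulary
print(count_vect.vocabulary_.get('car'))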

In [46]:
# Count: so far we only count word occurrences, which gives longer documents more weight than shorter ones.
# TF (term frequency): to correct for length, divide each count by the total number of words in the document.
# TF-IDF: additionally down-weight words like "the", "is", "an" that occur in nearly every document;
# TF-IDF = term frequency times inverse document frequency.
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
corpus_train_tfidf = tfidf_transformer.fit_transform(corpus_train_counts)
corpus_train_tfidf.shape


Out[46]:
(11314, 101631)
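To make the weighting concrete, here is a minimal toy sketch, not part of the original notebook, using TfidfVectorizer (which combines counting and TF-IDF in one step) on two invented sentences:

from sklearn.feature_extraction.text import TfidfVectorizer
toy_corpus = ["the cat sat", "the dog ran"]   # made-up documents
toy_vect = TfidfVectorizer()
toy_tfidf = toy_vect.fit_transform(toy_corpus)
# 'the' occurs in both documents, so its idf (and final weight) is lower than 'cat' or 'sat'
# (use get_feature_names_out() instead on newer scikit-learn versions)
print(dict(zip(toy_vect.get_feature_names(), toy_tfidf.toarray()[0])))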

Train the Model


In [50]:
print(set(corpus.target))
print(len(corpus.target_names))


set([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])
20

In [79]:
# chain the preprocessing steps and the classifier in a single pipeline
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
pipeline = Pipeline([('vect', CountVectorizer()), 
                     ('tfidf', TfidfTransformer()), 
                     ('clf', MultinomialNB())])
pipeline_fit = pipeline.fit(corpus.data, corpus.target)
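As a quick sanity check (not part of the original run), the fitted pipeline can classify a couple of made-up sentences; the predicted class indices map back to category names through target_names:

# hypothetical example sentences, invented purely for illustration
docs_new = ['God is love', 'OpenGL on the GPU is fast']
for doc, category in zip(docs_new, pipeline_fit.predict(docs_new)):
    print('%r => %s' % (doc, corpus.target_names[category]))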

Predict on Test Data


In [82]:
# get the test split of the corpus, cleaned the same way as the training data
corpus_test = fetch_20newsgroups(subset='test', shuffle=True, remove=('headers', 'footers', 'quotes'))

# run pipeline 
predicted = pipeline_fit.predict(corpus_test.data)

Evaluate Performance


In [81]:
# check accuracy
import numpy as np
np.mean(predicted == corpus_test.target)


Out[81]:
0.60621348911311734
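The same number can also be computed with scikit-learn's own helper, which makes the intent a bit more explicit:

from sklearn import metrics
print(metrics.accuracy_score(corpus_test.target, predicted))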

Baselines


In [116]:
# The dummy classifier gives a measure of "baseline" performance, i.e. the
# success rate one should expect to achieve by simply guessing.
from sklearn.dummy import DummyClassifier
pipeline = Pipeline([('vect', CountVectorizer(stop_words='english')), 
                     ('tfidf', TfidfTransformer()), 
                     ('clf', DummyClassifier())])
pipeline_fit = pipeline.fit(corpus.data, corpus.target)

#predict
predicted = pipeline_fit.predict(corpus_test.data)

# check accuracy
np.mean(predicted == corpus_test.target)


Out[116]:
0.050318640467339353
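With 20 roughly balanced classes, guessing lands near 1/20 = 5%, which is what the run above shows. A slightly stronger baseline, sketched below and not part of the original run, always predicts the most frequent training class:

# hedged sketch: 'most_frequent' dummy baseline on the same features
dummy_pipeline = Pipeline([('vect', CountVectorizer(stop_words='english')),
                           ('tfidf', TfidfTransformer()),
                           ('clf', DummyClassifier(strategy='most_frequent'))])
dummy_pipeline.fit(corpus.data, corpus.target)
print(np.mean(dummy_pipeline.predict(corpus_test.data) == corpus_test.target))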

Improvements

1. Removing stop words


In [85]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([('vect', CountVectorizer(stop_words='english')), 
                     ('tfidf', TfidfTransformer()), 
                     ('clf', MultinomialNB())])
pipeline_fit = pipeline.fit(corpus.data, corpus.target)

# run pipeline 
predicted = pipeline_fit.predict(corpus_test.data)

# check accuracy
np.mean(predicted == corpus_test.target)


Out[85]:
0.67790759426447156

In [86]:
# Simply removing stop words improves test accuracy from about 0.61 to about 0.68.

2. Stemming


In [93]:
# create a stemmed version of CountVectorizer
import nltk
# nltk.download()  # uncomment to fetch the NLTK data if it is not installed yet
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)
class StemmedCountVectorizer(CountVectorizer):
    # override build_analyzer so each token produced by the standard analyzer
    # (tokenization, lowercasing, stop word removal) is stemmed afterwards
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: [stemmer.stem(w) for w in analyzer(doc)]
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

# fit the pipeline on the training data
pipeline = Pipeline([('vect', stemmed_count_vect), 
                     ('tfidf', TfidfTransformer()), 
                     ('mnb', MultinomialNB(fit_prior=False))])
pipeline_fit = pipeline.fit(corpus.data, corpus.target)

# predict
predicted = pipeline_fit.predict(corpus_test.data)

# evaluate
np.mean(predicted == corpus_test.target)


Out[93]:
0.67843866171003719
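To see what the overridden analyzer actually does (illustration only, not in the original run), it can be applied directly to a made-up sentence:

# the analyzer lowercases, tokenizes, drops stop words, then stems each remaining token
analyzer = stemmed_count_vect.build_analyzer()
print(analyzer("The runners were running quickly"))
# expected to print something like: ['runner', 'run', 'quick']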

3. Try other models


In [112]:
from sklearn.neighbors import KNeighborsClassifier
pipeline = Pipeline([('vect', CountVectorizer()), 
                     ('tfidf', TfidfTransformer()),
                     ('clf-knn', KNeighborsClassifier(n_neighbors=500))])

pipeline_fit = pipeline.fit(corpus.data, corpus.target)

#predict
predicted = pipeline_fit.predict(corpus_test.data)

# evaluate
np.mean(predicted == corpus_test.target)


Out[112]:
0.40560276155071695

4. More advanced models: SVM


In [114]:
# A linear support vector machine (SVM) is widely regarded as one of the strongest algorithms for text classification;
# SGDClassifier with hinge loss trains a linear SVM by stochastic gradient descent.
from sklearn.linear_model import SGDClassifier
pipeline = Pipeline([('vect', CountVectorizer()), 
                     ('tfidf', TfidfTransformer()),
                     ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                                               n_iter=5,  # called max_iter in newer scikit-learn versions
                                               random_state=42))])

pipeline_fit = pipeline.fit(corpus.data, corpus.target)

#predict
predicted = pipeline_fit.predict(corpus_test.data)

# evaluate
np.mean(predicted == corpus_test.target)


Out[114]:
0.68361656930430159

In [92]:
# Hyperparameter tuning is worth doing whenever the chosen algorithm has tunable parameters; see the sketch below.
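A hedged sketch of what such tuning could look like with GridSearchCV, using a small illustrative parameter grid (the grid values below are assumptions, not tuned results); note this re-fits the SVM pipeline several times, so it is noticeably slower than a single fit:

# GridSearchCV lives in sklearn.grid_search in very old scikit-learn versions
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],   # unigrams vs. unigrams + bigrams
              'tfidf__use_idf': (True, False),
              'clf-svm__alpha': (1e-2, 1e-3)}
gs_clf = GridSearchCV(pipeline, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(corpus.data, corpus.target)
print(gs_clf.best_score_)
print(gs_clf.best_params_)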

More Evaluation Metrics


In [115]:
from sklearn.metrics import cohen_kappa_score
cohen_kappa_score(corpus_test.target, predicted)


Out[115]:
0.66652234784910114

In [96]:
from sklearn import metrics
print(metrics.classification_report(
    corpus_test.target,
    predicted,
    target_names=corpus_test.target_names))


                          precision    recall  f1-score   support

             alt.atheism       0.56      0.42      0.48       319
           comp.graphics       0.69      0.67      0.68       389
 comp.os.ms-windows.misc       0.67      0.60      0.63       394
comp.sys.ibm.pc.hardware       0.65      0.65      0.65       392
   comp.sys.mac.hardware       0.76      0.68      0.72       385
          comp.windows.x       0.74      0.71      0.73       395
            misc.forsale       0.48      0.85      0.61       390
               rec.autos       0.79      0.70      0.74       396
         rec.motorcycles       0.73      0.77      0.75       398
      rec.sport.baseball       0.82      0.78      0.80       397
        rec.sport.hockey       0.82      0.91      0.86       399
               sci.crypt       0.71      0.74      0.73       396
         sci.electronics       0.67      0.49      0.57       393
                 sci.med       0.76      0.79      0.78       396
               sci.space       0.70      0.76      0.73       394
  soc.religion.christian       0.61      0.82      0.70       398
      talk.politics.guns       0.56      0.70      0.62       364
   talk.politics.mideast       0.74      0.82      0.78       376
      talk.politics.misc       0.70      0.35      0.47       310
      talk.religion.misc       0.52      0.12      0.20       251

             avg / total       0.69      0.68      0.67      7532


In [98]:
metrics.confusion_matrix(corpus_test.target, predicted)


Out[98]:
array([[135,   0,   2,   2,   0,   2,  13,   4,   9,   5,   4,   4,   4,
          8,  20,  73,   6,  20,   0,   8],
       [  7, 262,  20,  12,   7,  22,  12,   1,   5,   3,   0,  14,   5,
          2,  13,   1,   1,   2,   0,   0],
       [  2,  15, 238,  34,  18,  23,  18,   1,   3,   4,   2,   4,   1,
          8,  11,   1,   4,   1,   4,   2],
       [  0,  14,  26, 256,  21,   9,  25,   2,   1,   1,   2,   9,  21,
          1,   1,   0,   0,   2,   1,   0],
       [  0,   6,   6,  35, 261,   7,  30,   6,   9,   0,   3,   5,   9,
          2,   4,   1,   1,   0,   0,   0],
       [  0,  37,  33,   4,   6, 282,  14,   0,   2,   1,   0,   6,   3,
          1,   5,   0,   0,   1,   0,   0],
       [  0,   3,   0,  12,   9,   0, 330,   7,   6,   2,   2,   1,   5,
          0,   6,   1,   5,   1,   0,   0],
       [  6,   1,   4,   1,   1,   3,  37, 276,  21,   3,   3,   2,  16,
          2,   5,   0,   8,   5,   2,   0],
       [  2,   0,   1,   1,   1,   0,  24,  16, 307,   4,   0,   1,   5,
          7,   8,   4,   6,   5,   4,   2],
       [  3,   1,   0,   0,   1,   1,  22,   2,   5, 311,  36,   1,   1,
          2,   3,   4,   0,   0,   4,   0],
       [  2,   2,   0,   1,   0,   0,   9,   0,   2,   7, 364,   1,   0,
          2,   2,   1,   4,   0,   0,   2],
       [  3,   5,   5,   3,   4,   6,  20,   1,   3,   6,   1, 294,   5,
          4,   7,   2,  15,   6,   4,   2],
       [  1,  11,   8,  22,   8,  11,  32,  11,  14,   7,   5,  37, 194,
         14,  12,   2,   2,   2,   0,   0],
       [  1,   7,   1,   0,   0,   2,  23,   3,   5,   1,   7,   2,   7,
        312,   3,   7,   5,   5,   4,   1],
       [  4,   7,   3,   1,   1,   2,  22,   5,   4,   3,   2,   2,   9,
          9, 301,   3,   4,   7,   5,   0],
       [ 14,   3,   2,   1,   0,   2,  15,   0,   1,   4,   2,   1,   2,
          6,   6, 325,   2,   6,   2,   4],
       [  3,   2,   4,   3,   2,   2,  16,   3,   7,   4,   1,  16,   0,
          6,   6,  12, 253,   9,   9,   6],
       [ 13,   2,   2,   0,   0,   2,   8,   1,   5,   6,   1,   3,   1,
          2,   1,   7,  10, 308,   4,   0],
       [ 11,   0,   0,   2,   1,   2,   7,   5,   6,   4,   7,   9,   1,
          8,  13,   5,  98,  20, 109,   2],
       [ 32,   4,   2,   1,   1,   1,  11,   6,   5,   1,   4,   2,   1,
         13,   4,  85,  28,  15,   4,  31]])
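Raw counts are hard to compare across classes of different sizes; a small follow-up sketch (not part of the original run) normalizes each row so the diagonal reads as per-class recall. The large off-diagonal entry of 85 in the last row, for example, shows talk.religion.misc being confused with soc.religion.christian.

# row-normalize the confusion matrix: each row sums to 1, the diagonal is per-class recall
cm = metrics.confusion_matrix(corpus_test.target, predicted)
cm_normalized = cm.astype(float) / cm.sum(axis=1, keepdims=True)
print(np.round(cm_normalized.diagonal(), 2))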

In [ ]: