In [1]:
# Naive Bayes classification testing

%pylab  inline

# SciKit Learn's newsgroup dataset
from sklearn.datasets import fetch_20newsgroups

news = fetch_20newsgroups(subset='all')

print(type(news.data), type(news.target), type(news.target_names))

print(news.target_names)

print(len(news.data))

print(len(news.target))

# Check first dataset entry
print(news.data[0])
print(news.target[0], news.target_names[news.target[0]])


Populating the interactive namespace from numpy and matplotlib
(<type 'list'>, <type 'numpy.ndarray'>, <type 'list'>)
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
18846
18846
From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!


(10, 'rec.sport.hockey')

In [2]:
# Convert the text-based data into numeric data

SPLIT_PERC = 0.75
split_size = int(len(news.data) * SPLIT_PERC)

X_train = news.data[:split_size]
X_test = news.data[split_size:]
y_train = news.target[:split_size]
y_test = news.target[split_size:]

print(len(X_train))
print(len(y_train))


14134
14134

In [3]:
# Naive Bayes training

import numpy as np

from sklearn.cross_validation import cross_val_score, KFold
from scipy.stats import sem

def evaluate_cross_validation(clf, X, y, K):
    cv = KFold(len(y), K, shuffle=True, random_state=0)
    
    scores = cross_val_score(clf, X, y, cv=cv)
    
    print(scores)
    print("Mean score {0:.3f} (+/-{1:.3f})".format(np.mean(scores), sem(scores)))

    
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer

clf_1 = Pipeline([
        ('vect', CountVectorizer()),
        ('clf', MultinomialNB()),
        ])
clf_2 = Pipeline([
        ('vect', HashingVectorizer(non_negative=True)),
        ('clf', MultinomialNB()),
        ])
clf_3 = Pipeline([
        ('vect', TfidfVectorizer()),
        ('clf', MultinomialNB()),
        ])

    
    
clfs = [clf_1, clf_2, clf_3]

for clf in clfs:
    evaluate_cross_validation(clf, news.data, news.target, 5)


[ 0.85782493  0.85725657  0.84664367  0.85911382  0.8458477 ]
Mean score 0.853 (+/-0.003)
[ 0.75543767  0.77659857  0.77049615  0.78508888  0.76200584]
Mean score 0.770 (+/-0.005)
[ 0.84482759  0.85990979  0.84558238  0.85990979  0.84213319]
Mean score 0.850 (+/-0.004)

In [4]:
# New classifier with different regex to split the words
clf_4 = Pipeline([
        ('vect', TfidfVectorizer(
                token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",
                )),
        ('clf', MultinomialNB()),
        ])

evaluate_cross_validation(clf_4, news.data, news.target, 5)


[ 0.86100796  0.8718493   0.86203237  0.87291059  0.8588485 ]
Mean score 0.865 (+/-0.003)

In [5]:
# Let's use stop_words

def get_stop_words():
    result = set()
    with open('stopwords_en.txt', 'r') as in_file:
        for line in in_file.readlines():
            result.add(line.strip())
            
    return result

print(get_stop_words())


set(['a', 'the', 'an'])

In [6]:
# Try another classifier with get_stop_words now
clf_5 = Pipeline([
        ('vect', TfidfVectorizer(
                stop_words=get_stop_words(),
                token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9\_\.]+\b",
                )),
        ('clf', MultinomialNB()),
        ])

evaluate_cross_validation(clf_5, news.data, news.target, 5)


[ 0.86419098  0.87476784  0.86388963  0.87397188  0.85990979]
Mean score 0.867 (+/-0.003)

In [7]:
# Testing with a different alpha parameter for MultinomialNB

clf_7 = Pipeline([
        ('vect', TfidfVectorizer(
                stop_words=get_stop_words(),
                token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9\_\.]+\b",
                )),
        ('clf', MultinomialNB(alpha=0.01)),
        ])

evaluate_cross_validation(clf_7, news.data, news.target, 5)


[ 0.9193634   0.92066861  0.91748474  0.9241178   0.91775007]
Mean score 0.920 (+/-0.001)

In [9]:
# Let's evaluate the performance

from sklearn import metrics

def train_and_evaluate(clf, X_train, X_test, y_train, y_test):
    clf.fit(X_train, y_train)
    
    print("Accuracy on training set:")
    print(clf.score(X_train, y_train))
    print("Accuracy on testing set:")
    print(clf.score(X_test, y_test))
    y_pred = clf.predict(X_test)
    
    print("Classification Report:")
    print(metrics.classification_report(y_test, y_pred))
    
    print("Confusion Matrix:")
    print(metrics.confusion_matrix(y_test, y_pred))
    
train_and_evaluate(clf_7, X_train, X_test, y_train, y_test)


Accuracy on training set:
0.996745436536
Accuracy on testing set:
0.915322580645
Classification Report:
             precision    recall  f1-score   support

          0       0.95      0.88      0.91       216
          1       0.84      0.84      0.84       246
          2       0.91      0.83      0.87       274
          3       0.80      0.86      0.83       235
          4       0.87      0.90      0.89       231
          5       0.88      0.91      0.89       225
          6       0.88      0.80      0.84       248
          7       0.93      0.93      0.93       275
          8       0.96      0.98      0.97       226
          9       0.97      0.94      0.96       250
         10       0.97      1.00      0.98       257
         11       0.96      0.98      0.97       261
         12       0.90      0.91      0.90       216
         13       0.94      0.95      0.95       257
         14       0.94      0.96      0.95       246
         15       0.90      0.97      0.93       234
         16       0.90      0.97      0.94       218
         17       0.97      0.99      0.98       236
         18       0.95      0.90      0.93       213
         19       0.87      0.76      0.81       148

avg / total       0.92      0.92      0.91      4712

Confusion Matrix:
[[190   0   0   0   1   0   0   0   0   1   0   0   0   1   0   9   2   0
    0  12]
 [  0 206   5   4   3  14   4   0   0   0   0   1   3   2   3   0   0   1
    0   0]
 [  0  12 227  24   1   5   1   0   1   0   0   0   0   0   1   0   1   0
    1   0]
 [  0   5   7 202  10   3   4   0   0   0   0   0   3   0   1   0   0   0
    0   0]
 [  0   2   3   5 208   1   4   0   0   0   2   0   5   0   1   0   0   0
    0   0]
 [  0   9   3   2   1 204   0   1   1   0   0   0   0   2   1   0   0   1
    0   0]
 [  0   2   3   9   6   0 198  14   0   2   1   2   6   2   2   0   0   1
    0   0]
 [  0   3   0   1   1   0   7 255   4   1   0   0   0   1   0   0   2   0
    0   0]
 [  0   0   0   0   0   1   1   2 221   0   0   0   0   1   0   0   0   0
    0   0]
 [  0   1   0   0   0   0   1   0   2 236   5   0   1   2   0   1   1   0
    0   0]
 [  0   0   0   1   0   0   0   0   0   0 256   0   0   0   0   0   0   0
    0   0]
 [  0   0   0   0   0   1   0   0   0   0   0 255   0   1   0   0   3   0
    1   0]
 [  0   1   0   2   5   1   3   1   0   2   1   1 196   2   1   0   0   0
    0   0]
 [  0   1   0   1   1   0   0   0   0   0   0   2   2 245   3   0   1   0
    0   1]
 [  0   3   0   0   1   0   1   0   0   0   0   0   0   1 237   0   1   0
    1   1]
 [  1   0   1   2   0   0   0   1   0   0   0   1   1   0   1 226   0   0
    0   0]
 [  0   0   1   0   0   0   1   0   1   0   0   1   0   0   0   0 212   0
    2   0]
 [  0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 234
    1   0]
 [  1   0   0   0   0   0   1   0   0   0   0   2   1   1   0   1   7   4
  192   3]
 [  9   0   0   0   0   1   0   0   0   1   0   0   0   0   0  14   5   1
    4 113]]

In [10]:
# Look inside the vectorizer to see which tokens were used to create our dictionary
print(len(clf_7.named_steps['vect'].get_feature_names()))


144440