In [4]:
# This is the Jupyter notebook code for text classification with Naive Bayes
%pylab inline
from sklearn.datasets import fetch_20newsgroups
news = fetch_20newsgroups(subset='all')


Populating the interactive namespace from numpy and matplotlib

In [8]:
print news.keys()


['description', 'DESCR', 'filenames', 'target_names', 'data', 'target']

In [9]:
print news.description


the 20 newsgroups by date dataset

In [11]:
print len(news.data)
print len(news.target_names)


18846
20

In [12]:
print news.target_names


['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']

In [14]:
# have a look at one instance
print news.data[0]
print news.target_names[news.target[0]]


From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!


rec.sport.hockey

In [15]:
# Split the data into train and test
SPLIT_PERC = 0.75
split_size = int(len(news.data) * SPLIT_PERC)
X_train = news.data[:split_size]
X_test = news.data[split_size:]
y_train = news.target[:split_size]
y_test = news.target[split_size:]
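
The same 75/25 split can also be produced with scikit-learn's train_test_split helper; a minimal sketch, assuming the same older scikit-learn version used throughout this notebook (in newer releases the helper lives in sklearn.model_selection), and noting that it shuffles by default, unlike the plain slicing above:

In [ ]:
# Equivalent split via scikit-learn's helper (shuffled, so the resulting
# partitions differ from the simple slicing above)
from sklearn.cross_validation import train_test_split
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    news.data, news.target, test_size=0.25, random_state=0)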

In [20]:
# there are three different classes that can transform text into numeric features
# (a toy demo follows after this cell):
# - CountVectorizer
# - HashingVectorizer
# - TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer

# 1st classifier with CountVectorizer
clf_1 = Pipeline([
         ('vect', CountVectorizer()),
         ('clf', MultinomialNB()),
    ])

# 2nd classifier with HashingVectorizer
clf_2 = Pipeline([
         ('vect', HashingVectorizer(non_negative=True)),
         ('clf', MultinomialNB()),
    ])

# 3rd classifier with TfidfVectorizer
clf_3 = Pipeline([
         ('vect', TfidfVectorizer()),
         ('clf', MultinomialNB()),
    ])
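
To see what these vectorizers actually produce, here is a small illustrative demo; the toy corpus below is made up for demonstration only:

In [ ]:
# CountVectorizer: raw term counts; TfidfVectorizer: counts reweighted by
# inverse document frequency. HashingVectorizer instead hashes tokens into a
# fixed number of columns and keeps no explicit vocabulary.
toy = ["the pens beat the devils", "the devils lost again"]
cv = CountVectorizer()
counts = cv.fit_transform(toy)
print cv.get_feature_names()
print counts.toarray()
print TfidfVectorizer().fit_transform(toy).toarray()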

In [21]:
print clf_1


Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [22]:
print clf_2


Pipeline(steps=[('vect', HashingVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
         dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
         lowercase=True, n_features=1048576, ngram_range=(1, 1),
         non_negative=True, norm=u'l2', preprocessor=None, stop_words=None,
         strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
         tokenizer=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [23]:
print clf_3


Pipeline(steps=[('vect', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True...rue,
        vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [26]:
# define a cross-validation helper: run K-fold CV and report each fold's score
# plus the mean and the standard error of the mean (sem)
from sklearn.cross_validation import cross_val_score, KFold
from scipy.stats import sem
def evaluate_cross_validation(clf, X, y, K):
    # K folds, shuffled with a fixed random state so results are reproducible
    cv = KFold(len(y), K, shuffle=True, random_state=0)
    scores = cross_val_score(clf, X, y, cv=cv)
    print scores
    print ("Mean score: {0:.3f} (+/-{1:.3f})").format(np.mean(scores), sem(scores))

In [27]:
# run the validation
clfs = [clf_1, clf_2, clf_3]
for clf in clfs:
    evaluate_cross_validation(clf, news.data, news.target, 5)


[ 0.85782493  0.85725657  0.84664367  0.85911382  0.8458477 ]
Mean score: 0.853 (+/-0.003)
[ 0.75543767  0.77659857  0.77049615  0.78508888  0.76200584]
Mean score: 0.770 (+/-0.005)
[ 0.84482759  0.85990979  0.84558238  0.85990979  0.84213319]
Mean score: 0.850 (+/-0.004)

In [28]:
# create another classifier with a regular expression token pattern to restrict
# which tokens are counted (a quick check of the pattern follows after this cell)
clf_4 = Pipeline([
         ('vect', TfidfVectorizer(
            token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",)),
         ('clf', MultinomialNB()),
    ])
evaluate_cross_validation(clf_4, news.data, news.target, 5)


[ 0.86100796  0.8718493   0.86203237  0.87291059  0.8588485 ]
Mean score: 0.865 (+/-0.003)
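
To check what the custom token_pattern keeps and drops, the vectorizer's analyzer can be applied directly to a sample sentence (the sentence below is made up for illustration): tokens must be at least three characters long and contain a letter somewhere in the middle, so very short words and pure numbers are discarded, while '.', '-' and '_' are allowed inside a token.

In [ ]:
# Inspect the tokenization produced by the custom token_pattern
vect = TfidfVectorizer(
    token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b")
analyze = vect.build_analyzer()
print analyze("posted from po4.andrew.cmu.edu on 1993-04-19 by a pens fan")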

In [32]:
# apply stop words loaded from a plain-text file (one word per line)
# TODO: need to download the stopwords_en.txt
def get_stop_words():
    result = set()
    with open('stopwords_en.txt', 'r') as f:
        for line in f:
            result.add(line.strip())
    return result

# create the clf
clf_5 = Pipeline([
         ('vect', TfidfVectorizer(
            stop_words=get_stop_words(),
            token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",)),
         ('clf', MultinomialNB()),
    ])
evaluate_cross_validation(clf_5, news.data, news.target, 5)


[ 0.87692308  0.89015654  0.879013    0.88829928  0.87874768]
Mean score: 0.883 (+/-0.003)
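
Finally, the train/test split prepared earlier (but not used by the cross-validation above) can serve as a held-out check for the best pipeline; a minimal sketch using standard sklearn.metrics functions:

In [ ]:
# Fit the best-performing pipeline on the training split and evaluate it
# on the held-out test split
from sklearn import metrics
clf_5.fit(X_train, y_train)
y_pred = clf_5.predict(X_test)
print metrics.accuracy_score(y_test, y_pred)
print metrics.classification_report(y_test, y_pred, target_names=news.target_names)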

In [ ]: