The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.
http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html
In [15]:
In [2]:
import logging
import numpy as np
from optparse import OptionParser
import sys
from time import time
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils.extmath import density
from pprint import pprint
def classifier(clf, X_train, y_train, X_test, feature_names, categories):
    """Fit ``clf`` on the training data, print its top-weighted features,
    and return class-membership probabilities for the test set.

    Parameters
    ----------
    clf : estimator with ``fit``, ``predict_proba`` and a ``coef_`` attribute
        (e.g. ``MultinomialNB``).
    X_train, X_test : feature matrices (e.g. tf-idf sparse matrices).
    y_train : training labels aligned with ``X_train``.
    feature_names : np.ndarray of str, column names of the feature matrices.
    categories : sequence of class names, one per row of ``clf.coef_``.

    Returns
    -------
    ndarray of shape (n_test_samples, n_classes) from ``clf.predict_proba``.
    """
    # Adapted from below
    # Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
    #         Olivier Grisel <olivier.grisel@ensta.org>
    #         Mathieu Blondel <mathieu@mblondel.org>
    #         Lars Buitinck <L.J.Buitinck@uva.nl>
    # License: BSD 3 clause
    clf.fit(X_train, y_train)
    pred_prob = clf.predict_proba(X_test)

    # NOTE(review): `coef_` was deprecated/removed for naive Bayes in
    # scikit-learn >= 1.2; `feature_log_prob_` is the modern equivalent.
    # Kept as-is because `clf` is a generic parameter here.
    print("top 10 keywords per class:")
    for i, category in enumerate(categories):
        # Indices of the 10 largest per-class weights (ascending order).
        top10 = np.argsort(clf.coef_[i])[-10:]
        print("{0}: {1}".format(category, " ".join(feature_names[top10])))
    print()
    return pred_prob
from sklearn.datasets import fetch_20newsgroups

# Restrict the 20-newsgroups corpus to four thematically distinct groups
# (two religion-related, two technical) for a small multi-class problem.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

# Strip metadata that would make classification artificially easy.
remove = ('headers', 'footers', 'quotes')

data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42,
                                remove=remove)
data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42,
                               remove=remove)
print('data loaded')

# Sublinear tf scaling, drop terms appearing in >50% of documents,
# and remove English stop words.
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')

categories = data_train.target_names  # canonical (sorted) category order
print(categories)
print()
pprint(data_train.data[:2])
pprint(data_train.target[:2])
print()

y_train = data_train.target
X_train = vectorizer.fit_transform(data_train.data)
X_test = vectorizer.transform(data_test.data)  # reuse the training vocabulary

# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back for older versions.
try:
    feature_names = np.array(vectorizer.get_feature_names_out())
except AttributeError:
    feature_names = np.array(vectorizer.get_feature_names())

pred_prob = classifier(MultinomialNB(alpha=.01), X_train, y_train, X_test,
                       feature_names, categories)
In [3]:
# Configure numpy's printed output for readability: three decimal
# places and plain (non-scientific) notation.
np.set_printoptions(precision=3, suppress=True)

# Header listing the class order of the probability columns.
print(['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc'])
pred_prob
Out[3]: