In [1]:
""" The goal is to classify dataset, based on a set of labels.
Bayesian statistics are used to do this,
along with various "naive" assumptions about how
the data is distributed.
"""
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
In [2]:
""" Gaussian Naive Bayes
"""
from sklearn.datasets import make_blobs
X, y = make_blobs(100, 2, centers=2, random_state=2, cluster_std=1.5)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='RdBu')
Out[2]:
In [3]:
""" Use the `GaussianNB` estimator
"""
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X, y)
""" Now let's generate some new data and predict the label:
"""
rng = np.random.RandomState(0)
Xnew = [-6, -14] + [14, 18] * rng.rand(2000, 2)
ynew = model.predict(Xnew)
""" Now we can plot this new data to get an idea of where the decision boundary is:
"""
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='RdBu')
lim = plt.axis()
plt.scatter(Xnew[:, 0], Xnew[:, 1], c=ynew, s=20, cmap='RdBu', alpha=0.1)
plt.axis(lim)
Out[3]:
In [4]:
""" But Bayesian statistics also allow for probabilistic labeling:
"""
yprob = model.predict_proba(Xnew)
yprob[-8:].round(2)
Out[4]:
In [5]:
""" Multinomial Naive Bayes
This will just be a quick whirl wind example in:
Text Classification
"""
from sklearn.datasets import fetch_20newsgroups
# Download (or load from local cache) the 20-newsgroups text corpus.
data = fetch_20newsgroups()
# Last expression: display the list of available newsgroup names.
data.target_names
Out[5]:
In [6]:
# Restrict the task to four topics and fetch the corpus's canonical
# train/test split for just those categories.
categories = ['talk.religion.misc', 'soc.religion.christian',
              'sci.space', 'comp.graphics']
train, test = (fetch_20newsgroups(subset=split, categories=categories)
               for split in ('train', 'test'))

# Peek at one raw posting to see what the input text looks like.
print(train.data[5])
In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# TF-IDF features feeding a multinomial NB classifier, bundled as one
# pipeline so raw strings go in and class labels come out.
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(train.data, train.target)
labels = model.predict(test.data)


def predict_category(s, train=train, model=model):
    """Return the predicted newsgroup name for a single input string."""
    predicted = model.predict([s])
    return train.target_names[predicted[0]]


# Try the classifier on a few out-of-corpus phrases.
for query in ('sending a payload to the ISS',
              'discussing islam vs atheism',
              'determining the screen resolution'):
    print(predict_category(query))
In [11]:
""" When to use Naive Bayes
Naive Bayes is a very simple classification option.
* When the naive assumptions actually match the data (very rare in practice)
* For very well-separated categories, when model complexity is less important
* For very high-dimensional data, when model complexity is less important
"""
print('')
In [ ]: