In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default plot styling to the figures drawn below.
sns.set()
In [3]:
from sklearn.datasets import make_blobs
# Generate a small, reproducible two-class toy dataset: 100 two-dimensional
# points drawn around 2 centers.
x, y = make_blobs(
    n_samples=100,
    n_features=2,
    centers=2,
    cluster_std=1.5,
    random_state=2,
)
# Colour each point by its class label.
plt.scatter(x[:, 0], x[:, 1], c=y, s=50, cmap='RdBu')
Out[3]:
In [9]:
from sklearn.naive_bayes import GaussianNB
# Fit a Gaussian naive Bayes classifier to the toy blobs.
model = GaussianNB()
model.fit(X=x, y=y)
Out[9]:
In [11]:
# Draw 2000 reproducible points from the unit square, then scale and shift
# them so they cover the rectangle [-6, 8) x [-14, 4).
rng = np.random.RandomState(0)
xnew = np.array([-6, -14]) + np.array([14, 18]) * rng.rand(2000, 2)
# Label every sampled point with the fitted classifier.
ynew = model.predict(xnew)
# Draw the training points first and record the axis limits they produce ...
plt.scatter(*x.T, s=50, c=y, cmap='RdBu')
lim = plt.axis()
# ... then overlay the faint cloud of predictions and restore those limits,
# so the wider cloud does not rescale the view.
plt.scatter(*xnew.T, s=20, c=ynew, cmap='RdBu', alpha=0.1)
plt.axis(lim)
Out[11]:
In [12]:
# Per-class probabilities for the sampled points; show the last eight rows
# rounded to two decimals.
yprob = model.predict_proba(xnew)
np.round(yprob[-8:], 2)
Out[12]:
In [13]:
from sklearn.datasets import fetch_20newsgroups
# Load the newsgroups corpus (training subset, which is the loader's default)
# and list the names of the available categories.
data = fetch_20newsgroups(subset='train')
data.target_names
Out[13]:
In [14]:
# Restrict the corpus to four categories and fetch the matching train and
# test subsets (train first, then test).
categories = [
    'talk.religion.misc',
    'soc.religion.christian',
    'sci.space',
    'comp.graphics',
]
train, test = (
    fetch_20newsgroups(subset=split, categories=categories)
    for split in ('train', 'test')
)
In [15]:
# Show the raw text of one training document (index 5) as a sample.
print(train.data[5])
In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
# Chain TF-IDF vectorisation with a multinomial naive Bayes classifier.
# NOTE: this rebinds `model`, replacing the GaussianNB classifier fitted in an
# earlier cell.
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(train.data, train.target)
# Predict a label for every document in the test subset.
labels = model.predict(test.data)
In [17]:
from sklearn.metrics import confusion_matrix
# Compare the true test labels with the predicted ones.
mat = confusion_matrix(test.target, labels)
# The matrix is transposed before plotting, so the horizontal axis carries the
# true labels and the vertical axis the predicted ones.
sns.heatmap(
    mat.T,
    square=True,
    annot=True,
    fmt='d',
    cbar=False,
    xticklabels=train.target_names,
    yticklabels=train.target_names,
)
plt.xlabel('true label')
plt.ylabel('predicted label')
Out[17]:
In [18]:
def predict_category(s, train=train, model=model):
    """Return the category name the model predicts for a single string.

    Parameters
    ----------
    s : str
        Text to classify.
    train : optional
        Object whose ``target_names`` sequence is indexed by the predicted
        label. Defaults to the module-level ``train`` as it was bound when
        this function was defined.
    model : optional
        Object whose ``predict`` method is called on a one-element list.
        Defaults to the module-level ``model`` as it was bound when this
        function was defined.

    Returns
    -------
    The entry of ``train.target_names`` selected by the first prediction.
    """
    # The model is handed a one-element list, so the single prediction is
    # read back from position 0.
    pred = model.predict([s])
    return train.target_names[pred[0]]
In [19]:
# Try the classifier on a short phrase about spaceflight.
predict_category('sending a payload to the ISS')
Out[19]:
In [20]:
# Try the classifier on a short phrase about religion.
predict_category('discussing islam vs atheism')
Out[20]:
In [21]:
# Try the classifier on a short phrase about display settings.
predict_category('determining the screen resolution')
Out[21]:
In [ ]: