In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [3]:
from sklearn.datasets import make_blobs
x, y = make_blobs(100, 2, centers=2, random_state=2, cluster_std=1.5)
plt.scatter(x[:, 0], x[:, 1], s=50, c=y, cmap='RdBu')


Out[3]:
<matplotlib.collections.PathCollection at 0x119151f10>

In [9]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(x, y)


Out[9]:
GaussianNB(priors=None)

In [11]:
rng = np.random.RandomState(0)
xnew = [-6, -14] + [14, 18] * rng.rand(2000, 2)
ynew = model.predict(xnew)
plt.scatter(x[:, 0], x[:, 1], c=y, s=50, cmap='RdBu')
lim = plt.axis()
plt.scatter(xnew[:, 0], xnew[:, 1], c=ynew, s=20, cmap='RdBu', alpha=0.1)
plt.axis(lim)


Out[11]:
(-6.0, 8.0, -14.0, 4.0)

In [12]:
yprob = model.predict_proba(xnew)
yprob[-8:].round(2)


Out[12]:
array([[ 0.89,  0.11],
       [ 1.  ,  0.  ],
       [ 1.  ,  0.  ],
       [ 1.  ,  0.  ],
       [ 1.  ,  0.  ],
       [ 1.  ,  0.  ],
       [ 0.  ,  1.  ],
       [ 0.15,  0.85]])

In [13]:
from sklearn.datasets import fetch_20newsgroups
data = fetch_20newsgroups()
data.target_names


Out[13]:
['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [14]:
categories = ['talk.religion.misc', 'soc.religion.christian', 'sci.space', 'comp.graphics']
train = fetch_20newsgroups(subset='train', categories=categories)
test = fetch_20newsgroups(subset='test', categories=categories)

In [15]:
print(train.data[5])


From: dmcgee@uluhe.soest.hawaii.edu (Don McGee)
Subject: Federal Hearing
Originator: dmcgee@uluhe
Organization: School of Ocean and Earth Science and Technology
Distribution: usa
Lines: 10


Fact or rumor....?  Madalyn Murray O'Hare an atheist who eliminated the
use of the bible reading and prayer in public schools 15 years ago is now
going to appear before the FCC with a petition to stop the reading of the
Gospel on the airways of America.  And she is also campaigning to remove
Christmas programs, songs, etc from the public schools.  If it is true
then mail to Federal Communications Commission 1919 H Street Washington DC
20054 expressing your opposition to her request.  Reference Petition number

2493.


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
model = make_pipeline(TfidfVectorizer(), MultinomialNB()).fit(train.data, train.target)
labels = model.predict(test.data)

In [17]:
from sklearn.metrics import confusion_matrix
mat = confusion_matrix(test.target, labels)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=train.target_names, yticklabels=train.target_names)
plt.xlabel('true label')
plt.ylabel('predicted label')


Out[17]:
<matplotlib.text.Text at 0x119d9f650>

In [18]:
def predict_category(s, train=train, model=model): 
    pred = model.predict([s])
    return train.target_names[pred[0]]

In [19]:
predict_category('sending a payload to the ISS')


Out[19]:
'sci.space'

In [20]:
predict_category('discussing islam vs atheism')


Out[20]:
'soc.religion.christian'

In [21]:
predict_category('determining the screen resolution')


Out[21]:
'comp.graphics'

In [ ]: