Import Package


In [2]:
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

import scipy.stats
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.naive_bayes import BernoulliNB

1. Make Sample Data


In [55]:
# Toy spam-classification data set.
# Each row is one message and each column is a binary indicator
# (1 = the word occurs in the message, 0 = it does not) for word1..word4.
# The label y is 0 for "not spam" and 1 for "spam".
#
# The feature matrix was originally drawn with np.random.randint(2, size=(10, 4))
# and no seed, so a kernel restart produced different data and different results.
# It is frozen here to one fixed draw so that the notebook is reproducible under
# "Restart Kernel -> Run All".
X = np.array([
    [0, 1, 0, 1],
    [1, 1, 1, 1],
    [0, 1, 0, 1],
    [1, 1, 1, 0],
    [1, 0, 0, 1],
    [1, 0, 1, 0],
    [1, 0, 0, 0],
    [0, 0, 1, 1],
    [0, 0, 0, 1],
    [1, 0, 1, 0],
])
y = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

# Combine features and label in one DataFrame; "result" is the last column.
df = pd.DataFrame(X, columns=["word1", "word2", "word3", "word4"])
df["result"] = y
df


Out[55]:
word1 word2 word3 word4 result
0 0 1 0 1 0
1 1 1 1 1 0
2 0 1 0 1 0
3 1 1 1 0 0
4 1 0 0 1 1
5 1 0 1 0 1
6 1 0 0 0 1
7 0 0 1 1 1
8 0 0 0 1 1
9 1 0 1 0 1

2. Make Bernoulli Naive Bayes Model


In [56]:
# Fit a Bernoulli naive Bayes classifier: every column except the last is a
# binary feature, and the last column ("result") is the class label.
# Positional selection uses .iloc because DataFrame.ix was deprecated in
# pandas 0.20 and removed in pandas 1.0.
clf_bern = BernoulliNB().fit(df.iloc[:, :-1], df.iloc[:, -1])

In [57]:
# class labels seen during fit (0: not spam, 1: spam)
clf_bern.classes_


Out[57]:
array([0, 1])

In [58]:
# number of training samples in each class, in the same order as clf_bern.classes_
clf_bern.class_count_


Out[58]:
array([ 4.,  6.])

In [59]:
# class priors P(y = C_k): the model stores them as logs, so exponentiate
np.exp(clf_bern.class_log_prior_)


Out[59]:
array([ 0.4,  0.6])

In [60]:
# per-class feature counts: fc[k, i] is the number of class-k training
# samples in which word i is present (value 1)
fc = clf_bern.feature_count_
fc


Out[60]:
array([[ 2.,  4.,  2.,  3.],
       [ 4.,  0.,  3.,  3.]])

In [61]:
# Unsmoothed (maximum-likelihood) estimate of P(word_i = 1 | y = C_k):
# feature count divided by the number of samples in the class.
# class_count_ is reshaped to a column so broadcasting divides every row of fc
# by its own class count, whatever the number of features.
fc / clf_bern.class_count_[:, np.newaxis]


Out[61]:
array([[ 0.5       ,  1.        ,  0.5       ,  0.75      ],
       [ 0.66666667,  0.        ,  0.5       ,  0.5       ]])

In [62]:
# Smoothed estimate of P(word_i = 1 | y = C_k) that the fitted model actually uses.
# BernoulliNB is constructed with its default smoothing (alpha = 1.0), so each entry is
# (feature_count + 1) / (class_count + 2) and no probability is exactly 0 or 1.
theta = np.exp(clf_bern.feature_log_prob_)
theta


Out[62]:
array([[ 0.5       ,  0.83333333,  0.5       ,  0.66666667],
       [ 0.625     ,  0.125     ,  0.5       ,  0.5       ]])

3. Predict


In [63]:
# new message to classify: contains word1 and word2, but not word3 or word4
x_new = np.array([1, 1, 0, 0])

In [66]:
# posterior class probabilities for x_new; columns follow clf_bern.classes_
clf_bern.predict_proba([x_new])
# for the sample data displayed above, P(not spam | x_new) is about 0.703


Out[66]:
array([[ 0.7032967,  0.2967033]])

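Manual check with the Bernoulli likelihood formula: $P(x_i \mid y = C_k) = \theta_{k,i}^{x_i} (1-\theta_{k,i})^{(1-x_i)}$, where $\theta_{k,i}$ is the probability that word $i$ appears in a message of class $C_k$. Multiplying these per-word likelihoods by the class prior and normalizing reproduces `predict_proba`.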


In [67]:
# Per-class likelihood of x_new: product over words of
# theta^x * (1 - theta)^(1 - x), i.e. theta where the word is present
# and (1 - theta) where it is absent.
likelihood = (theta ** x_new * (1 - theta) ** (1 - x_new)).prod(axis=1)

# Class priors P(y = C_k), recovered from the stored log-priors.
prior = np.exp(clf_bern.class_log_prior_)

# Unnormalized posterior; dividing by its sum gives P(y = C_k | x_new).
p = likelihood * prior
p / p.sum()


Out[67]:
array([ 0.7032967,  0.2967033])

In [ ]: