Import Package



In [2]:

    
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

import scipy.stats
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.naive_bayes import BernoulliNB

1. Make Sample Data



In [55]:

    
# Toy spam-classification data.
# X: 10 messages x 4 binary features (word1..word4; 1 = word present, 0 = absent).
# y: label per message, 0 = not spam, 1 = spam.
#
# The data was originally drawn with np.random.randint(2, size=(10, 4)) without a
# seed, so every re-run produced different values and the recorded outputs could
# not be reproduced. The draw shown in the notebook's recorded table is pinned here.
X = np.array([
    [0, 1, 0, 1],
    [1, 1, 1, 1],
    [0, 1, 0, 1],
    [1, 1, 1, 0],
    [1, 0, 0, 1],
    [1, 0, 1, 0],
    [1, 0, 0, 0],
    [0, 0, 1, 1],
    [0, 0, 0, 1],
    [1, 0, 1, 0],
])
y = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

# Collect features and label in one DataFrame; "result" ends up as the last column.
df = pd.DataFrame(X, columns=["word1", "word2", "word3", "word4"])
df["result"] = y
df

2. Make Bernoulli Naive Bayes Model



In [56]:

    
# Fit Bernoulli naive Bayes: every column but the last is a binary word feature,
# the last column ("result") is the class label.
# .iloc replaces the positional use of .ix, which was removed in pandas 1.0.
clf_bern = BernoulliNB().fit(df.iloc[:, :-1], df.iloc[:, -1])



In [57]:

    
# Class labels seen during fit (0: not spam, 1: spam) -- the labels themselves, not a count.
clf_bern.classes_









    Out[57]:





array([0, 1])



In [58]:

    
# Number of training samples in each class, ordered as (result 0, result 1).
clf_bern.class_count_









    Out[58]:





array([ 4.,  6.])



In [59]:

    
# Prior probability of each class (result 0, result 1).
# class_log_prior_ stores log-probabilities, so exponentiate to get probabilities.
np.exp(clf_bern.class_log_prior_)









    Out[59]:





array([ 0.4,  0.6])



In [60]:

    
# For each class (rows) and each word (columns): number of training samples
# in that class where the word feature is 1.
fc = clf_bern.feature_count_
fc









    Out[60]:





array([[ 2.,  4.,  2.,  3.],
       [ 4.,  0.,  3.,  3.]])



In [61]:

    
# Unsmoothed per-class frequency of each word: feature count / class sample count.
# The (n_classes, 1) column broadcasts across every feature column, so the number
# of features does not need to be hard-coded.
fc / clf_bern.class_count_[:, np.newaxis]









    Out[61]:





array([[ 0.5       ,  1.        ,  0.5       ,  0.75      ],
       [ 0.66666667,  0.        ,  0.5       ,  0.5       ]])



In [62]:

    
# Smoothed estimate of P(word = 1 | class); feature_log_prob_ stores logs, so exponentiate.
# These differ from the raw frequencies above because BernoulliNB smooths the counts:
# with the default alpha=1 each entry is (count + 1) / (class_count + 2).
theta = np.exp(clf_bern.feature_log_prob_)
theta









    Out[62]:





array([[ 0.5       ,  0.83333333,  0.5       ,  0.66666667],
       [ 0.625     ,  0.125     ,  0.5       ,  0.5       ]])

3. Predict



In [63]:

    
# New message to classify: word1 and word2 present, word3 and word4 absent.
x_new = np.array([1, 1, 0, 0])



In [66]:

    
# Posterior probability of each class for x_new, ordered as (result 0, result 1).
# predict_proba expects a 2-D input, hence the wrapping list.
# In the recorded run, P(result 0, not spam) is about 0.703.
clf_bern.predict_proba([x_new])









    Out[66]:





array([[ 0.7032967,  0.2967033]])

Code for the Bernoulli formula: $ P(x_i \mid y = C_k) = \theta_{k,i}^{x_i} (1-\theta_{k,i})^{(1-x_i)} $



In [67]:

    
# Reproduce predict_proba by hand.
# Per class: product over words of the Bernoulli likelihood of x_new, times the
# class prior. This gives the unnormalised posterior for each class.
p = (
    (theta ** x_new * (1 - theta) ** (1 - x_new)).prod(axis=1)
    * np.exp(clf_bern.class_log_prior_)
)
# Normalise so the class posteriors sum to 1.
p / p.sum()









    Out[67]:





array([ 0.7032967,  0.2967033])



In [ ]:

	word1	word2	word3	word4	result
0	0	1	0	1	0
1	1	1	1	1	0
2	0	1	0	1	0
3	1	1	1	0	0
4	1	0	0	1	1
5	1	0	1	0	1
6	1	0	0	0	1
7	0	0	1	1	1
8	0	0	0	1	1
9	1	0	1	0	1

	word1	word2	word3	word4	result
0	0	1	0	1	0
1	1	1	1	1	0
2	0	1	0	1	0
3	1	1	1	0	0
4	1	0	0	1	1
5	1	0	1	0	1
6	1	0	0	0	1
7	0	0	1	1	1
8	0	0	0	1	1
9	1	0	1	0	1

	word1	word2	word3	word4	result
0	0	1	0	1	0
1	1	1	1	1	0
2	0	1	0	1	0
3	1	1	1	0	0
4	1	0	0	1	1
5	1	0	1	0	1
6	1	0	0	0	1
7	0	0	1	1	1
8	0	0	0	1	1
9	1	0	1	0	1