In [2]:
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}
import scipy.stats
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.naive_bayes import BernoulliNB
In [55]:
# Spam-classification toy example.
# Build a binary sample: 4 word-presence features (word1..word4), each 0/1,
# and a label y where 0 = not spam, 1 = spam.
np.random.seed(0)  # fix the seed so the notebook reproduces under Restart & Run All
X = np.random.randint(2, size=(10, 4))
y = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
# Assemble features and label into a single DataFrame for inspection.
df = pd.DataFrame(X, columns=["word1", "word2", "word3", "word4"])
df["result"] = y
df
Out[55]:
In [56]:
clf_bern = BernoulliNB().fit(df.ix[:,:-1], df.ix[:,-1])
In [57]:
# the class labels the classifier learned (here: 0 = not spam, 1 = spam)
clf_bern.classes_
Out[57]:
In [58]:
# number of training samples seen for each class (result 0, result 1)
clf_bern.class_count_
Out[58]:
In [59]:
# prior probability of each class; class_log_prior_ is stored as a log,
# so exponentiate to recover P(y = C_k)
np.exp(clf_bern.class_log_prior_)
Out[59]:
In [60]:
# per-class counts: for each class (row), how many training samples had
# each word (column) present
fc = clf_bern.feature_count_
fc
Out[60]:
In [61]:
fc / np.repeat(clf_bern.class_count_[:, np.newaxis], 4, axis=1)
Out[61]:
In [62]:
# theta[k, i] = P(word_i = 1 | y = C_k), recovered by exponentiating the
# stored log-probabilities
# NOTE(review): these values may differ slightly from the raw frequencies
# above, presumably because BernoulliNB applies smoothing — confirm alpha
theta = np.exp(clf_bern.feature_log_prob_)
theta
Out[62]:
In [63]:
x_new = np.array([1, 1, 0, 0])
In [66]:
# posterior class probabilities for the new sample: column 0 is
# P(not spam | x_new), column 1 is P(spam | x_new)
clf_bern.predict_proba([x_new])
# the probability of class 0 (no spam) came out as ~0.703 in this run;
# the exact value depends on the randomly generated training sample
Out[66]:
Bernoulli naive Bayes likelihood formula: $ P(x_i \mid y = C_k) = \theta_{k,i}^{x_i} (1-\theta_{k,i})^{(1-x_i)} $
In [67]:
# Reproduce predict_proba by hand. For each class k:
#   p_k = prod_i theta_{k,i}^{x_i} * (1 - theta_{k,i})^{1 - x_i} * P(y = C_k)
# then normalize so the class probabilities sum to 1.
likelihood = ((theta ** x_new) * (1 - theta) ** (1 - x_new)).prod(axis=1)
prior = np.exp(clf_bern.class_log_prior_)
joint = likelihood * prior
joint / joint.sum()
Out[67]:
In [ ]: