Import Packages


In [3]:
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

import scipy.stats
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB

1. Make Sample Data


In [12]:
# Simulated training set for three dice (classes 0, 1 and 2), ten rows per die.
# Each row holds six random 0/1 values, one per face (number_1 .. number_6).
X = np.random.randint(2, size=(30, 6))

# Labels: rows 0-9 belong to die 0, rows 10-19 to die 1, rows 20-29 to die 2.
y0, y1, y2 = (np.full(10, label, dtype=float) for label in (0, 1, 2))
y = np.concatenate([y0, y1, y2])

# Gather features and labels in one DataFrame; "result" is the die label.
face_columns = [
    "number_1",
    "number_2",
    "number_3",
    "number_4",
    "number_5",
    "number_6",
]
df = pd.DataFrame(X, columns=face_columns).assign(result=y)
df.head()


Out[12]:
number_1 number_2 number_3 number_4 number_5 number_6 result
0 1 1 0 0 1 1 0.0
1 0 0 0 1 0 1 0.0
2 0 0 0 1 1 1 0.0
3 1 0 1 0 0 0 0.0
4 1 0 0 1 0 1 0.0

2. Make Multinomial Naive Bayes Model


In [13]:
# Fit a multinomial Naive Bayes classifier: every column except the last is a
# feature (number_1 .. number_6); the last column ("result") is the die label.
# Positional `.iloc` replaces `.ix`, which was deprecated in pandas 0.20 and
# removed in pandas 1.0.
clf_mult = MultinomialNB().fit(df.iloc[:, :-1], df.iloc[:, -1])

In [14]:
# Class labels the model saw during fit: one label per die (0, 1, 2).
clf_mult.classes_


Out[14]:
array([ 0.,  1.,  2.])

In [16]:
# Number of training rows (throws) observed for each die.
clf_mult.class_count_


Out[16]:
array([ 10.,  10.,  10.])

In [18]:
# Per-die totals for each face: one row per die (class), one column per
# face (number_1 .. number_6).
fc = clf_mult.feature_count_
fc


Out[18]:
array([[ 4.,  6.,  3.,  6.,  4.,  5.],
       [ 5.,  2.,  3.,  5.,  9.,  3.],
       [ 4.,  4.,  5.,  5.,  4.,  6.]])

In [20]:
# Unsmoothed estimate: divide each face count by its die's row total.
# keepdims=True keeps the totals as a column vector, so broadcasting performs
# the row-wise division without hard-coding the number of faces.
fc / fc.sum(axis=1, keepdims=True)


Out[20]:
array([[ 0.14285714,  0.21428571,  0.10714286,  0.21428571,  0.14285714,
         0.17857143],
       [ 0.18518519,  0.07407407,  0.11111111,  0.18518519,  0.33333333,
         0.11111111],
       [ 0.14285714,  0.14285714,  0.17857143,  0.17857143,  0.14285714,
         0.21428571]])

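Smoothing: $ \hat{\theta}_{k,i} = \frac{N_{k,i} + \alpha}{N_k + \alpha n} $, where $N_{k,i}$ is the count of face $i$ for die $k$, $N_k$ is the total count for die $k$, and $n$ is the number of faces (features).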


In [22]:
# Additive smoothing parameter: with alpha = 1.0, one pseudo-count is added
# to every face count.
clf_mult.alpha


Out[22]:
1.0

Multinomial formula: $ P(x_1, \ldots, x_n \mid y = C_k) \propto \prod_i \theta_{k,i}^{x_i} $ (the multinomial coefficient is omitted because it does not depend on the class).


In [24]:
# Reproduce the smoothed estimate by hand: add alpha to every face count and
# alpha * (number of faces) to each die's total count. The number of faces is
# read from fc itself, and keepdims=True lets broadcasting do the row-wise
# division instead of repeating the totals a hard-coded 6 times.
(fc + clf_mult.alpha) / (fc.sum(axis=1, keepdims=True) + clf_mult.alpha * fc.shape[1])


Out[24]:
array([[ 0.14705882,  0.20588235,  0.11764706,  0.20588235,  0.14705882,
         0.17647059],
       [ 0.18181818,  0.09090909,  0.12121212,  0.18181818,  0.3030303 ,
         0.12121212],
       [ 0.14705882,  0.14705882,  0.17647059,  0.17647059,  0.14705882,
         0.20588235]])

In [26]:
# The fitted model exposes log-probabilities; exponentiating them recovers
# theta. In the recorded run it matches the hand-computed smoothed estimate.
theta = np.exp(clf_mult.feature_log_prob_)
theta


Out[26]:
array([[ 0.14705882,  0.20588235,  0.11764706,  0.20588235,  0.14705882,
         0.17647059],
       [ 0.18181818,  0.09090909,  0.12121212,  0.18181818,  0.3030303 ,
         0.12121212],
       [ 0.14705882,  0.14705882,  0.17647059,  0.17647059,  0.14705882,
         0.20588235]])

3. Predict


In [29]:
# Predict which die produced a new observation of face counts
# (number_1: 5, number_2: 2, number_3: 11, number_4: 3, number_5: 1, number_6: 6).
x_new = np.array([5, 2, 11, 3, 1, 6])
clf_mult.predict_proba([x_new])

# In the recorded run, the third die (class 2) gets a probability of about 0.984.


Out[29]:
array([[ 0.01404581,  0.00163742,  0.98431677]])

In [38]:
# Second example: a larger observation dominated by face 5 (51 occurrences).
x_new = np.array([18, 24, 35, 24, 51, 13])
clf_mult.predict_proba([x_new])

# In the recorded run, the second die (class 1) gets a probability of about 0.9999.


Out[38]:
array([[  6.46122881e-07,   9.99945648e-01,   5.37058563e-05]])