Import Packages



In [3]:

    
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

import scipy.stats
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB

1. Make Sample Data



In [12]:

    
# Toy data set: 30 samples x 6 binary (0/1) features, one column per die face,
# split into three classes ("dice") of 10 consecutive samples each.
# Seed the legacy global NumPy RNG so Restart & Run All regenerates the same data.
# NOTE(review): outputs recorded in this notebook before the seed was added
# came from a different random draw and will not match a fresh run.
SEED = 0
np.random.seed(SEED)
X = np.random.randint(2, size=(30, 6))

# Class labels: rows 0-9 -> 0.0, rows 10-19 -> 1.0, rows 20-29 -> 2.0
y0 = np.zeros(10)
y1 = np.ones(10)
y2 = np.ones(10)*2
y = np.hstack([y0, y1, y2])

# Collect features and label in one DataFrame; "result" is the last column
df = pd.DataFrame(X, columns=["number_1","number_2","number_3","number_4","number_5","number_6"])
df["result"] = y
df.head()

2. Make Multinomial Naive Bayes Model



In [13]:

    
# Fit on every column except the last (the six face columns) against the last
# column ("result"). Positional .iloc replaces .ix, which pandas deprecated in
# 0.20 and removed in 1.0; for these positional slices the selection is the same.
clf_mult = MultinomialNB().fit(df.iloc[:, :-1], df.iloc[:, -1])



In [14]:

    
# Class labels the model learned from the "result" column: one label per die
clf_mult.classes_









    Out[14]:





array([ 0.,  1.,  2.])



In [16]:

    
# Number of training samples (rows) seen for each class
clf_mult.class_count_









    Out[16]:





array([ 10.,  10.,  10.])



In [18]:

    
# Per-class feature totals: row k holds, for each face column, the sum of that
# column over the training samples labelled with class k
fc = clf_mult.feature_count_
fc









    Out[18]:





array([[ 4.,  6.,  3.,  6.,  4.,  5.],
       [ 5.,  2.,  3.,  5.,  9.,  3.],
       [ 4.,  4.,  5.,  5.,  4.,  6.]])



In [20]:

    
# Unsmoothed estimate: divide each row of counts by that row's total.
# keepdims=True keeps the row sums as a column vector, so broadcasting handles
# any number of feature columns (no hard-coded 6).
fc / fc.sum(axis=1, keepdims=True)









    Out[20]:





array([[ 0.14285714,  0.21428571,  0.10714286,  0.21428571,  0.14285714,
         0.17857143],
       [ 0.18518519,  0.07407407,  0.11111111,  0.18518519,  0.33333333,
         0.11111111],
       [ 0.14285714,  0.14285714,  0.17857143,  0.17857143,  0.14285714,
         0.21428571]])

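Smoothing: $ \hat{\theta}_{ki} = \frac{N_{ki} + \alpha}{N_k + \alpha n} $, where $N_{ki}$ is the total count of feature $i$ in class $k$, $N_k = \sum_i N_{ki}$, and $n$ is the number of features.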



In [22]:

    
# Additive smoothing parameter alpha; with the value 1.0 shown below, one
# pseudo-count is added to every (class, feature) count
clf_mult.alpha









    Out[22]:





1.0

Multinomial formula: $ P(x_1, \ldots, x_n \mid y = C_k) \propto \prod_i \theta_{ki}^{x_i} $



In [24]:

    
# Smoothed estimate: (N_ki + alpha) / (N_k + alpha * n).
# The feature count n is read from fc itself rather than from the raw data X,
# and the row sums are broadcast via keepdims instead of np.repeat(..., 6).
(fc + clf_mult.alpha) / (fc.sum(axis=1, keepdims=True) + clf_mult.alpha * fc.shape[1])









    Out[24]:





array([[ 0.14705882,  0.20588235,  0.11764706,  0.20588235,  0.14705882,
         0.17647059],
       [ 0.18181818,  0.09090909,  0.12121212,  0.18181818,  0.3030303 ,
         0.12121212],
       [ 0.14705882,  0.14705882,  0.17647059,  0.17647059,  0.14705882,
         0.20588235]])



In [26]:

    
# The fitted model stores log-probabilities; exponentiating them gives the
# per-class feature probabilities, which match the hand-computed smoothed
# estimate in the recorded outputs
theta = np.exp(clf_mult.feature_log_prob_)
theta









    Out[26]:





array([[ 0.14705882,  0.20588235,  0.11764706,  0.20588235,  0.14705882,
         0.17647059],
       [ 0.18181818,  0.09090909,  0.12121212,  0.18181818,  0.3030303 ,
         0.12121212],
       [ 0.14705882,  0.14705882,  0.17647059,  0.17647059,  0.14705882,
         0.20588235]])

3. Predict



In [29]:

    
# Class probabilities for a new count vector:
# number_1=5, number_2=2, number_3=11, number_4=3, number_5=1, number_6=6
x_new = np.array([5, 2, 11, 3, 1, 6])
clf_mult.predict_proba(x_new.reshape(1, -1))  # one sample -> one row

# In the recorded run, the third die (class 2) gets probability ~0.984









    Out[29]:





array([[ 0.01404581,  0.00163742,  0.98431677]])



In [38]:

    
# Class probabilities for a second, larger count vector
x_new = np.array([18, 24, 35, 24, 51, 13])
clf_mult.predict_proba(x_new.reshape(1, -1))  # one sample -> one row

# In the recorded run, the second die (class 1) gets probability ~0.9999









    Out[38]:





array([[  6.46122881e-07,   9.99945648e-01,   5.37058563e-05]])