In [3]:
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}
import scipy.stats
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
In [12]:
# three dice, one class per die
# each die is thrown 10 times, giving 10 samples (rows) per class
X = np.random.randint(2, size=(30, 6))
y0 = np.zeros(10)
y1 = np.ones(10)
y2 = np.ones(10)*2
y = np.hstack([y0, y1, y2])
# make pandas dataframe
df = pd.DataFrame(X, columns=[f"number_{i}" for i in range(1, 7)])
df["result"] = y
df.head()
Out[12]:
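Note: X is drawn at random without a fixed seed, so the exact counts and the predicted probabilities later in this notebook vary from run to run; calling np.random.seed(...) before generating X (not done here) would make them reproducible.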
In [13]:
# fit on the six face-count columns; the last column is the class label
clf_mult = MultinomialNB().fit(df.iloc[:, :-1], df.iloc[:, -1])
In [14]:
# the three class labels, one per die
clf_mult.classes_
Out[14]:
In [16]:
# number of training samples seen per class (each die was thrown 10 times)
clf_mult.class_count_
Out[16]:
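Since y contains ten samples of each class, class_count_ is [10., 10., 10.].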
In [18]:
# per-class counts of each face value (column sums of X within each class)
fc = clf_mult.feature_count_
fc
Out[18]:
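feature_count_ is just the per-class column sum of the training data; a quick check using the variables above:

np.array_equal(fc, np.vstack([X[y == k].sum(axis=0) for k in range(3)]))  # True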
In [20]:
# unsmoothed maximum-likelihood estimate of each face probability per class
fc / np.repeat(fc.sum(axis=1)[:, np.newaxis], 6, axis=1)
Out[20]:
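The np.repeat broadcast above can be written more compactly; this keepdims form is equivalent:

fc / fc.sum(axis=1, keepdims=True)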
smoothing : $ \hat{\theta}_{ki} = \frac{N_{ki} + \alpha}{N_k + \alpha n} $, where $N_{ki}$ is the count of face $i$ in class $k$, $N_k = \sum_i N_{ki}$, and $n$ is the number of features
In [22]:
# the smoothing parameter; alpha = 1.0 by default (Laplace smoothing: add one to every count)
clf_mult.alpha
Out[22]:
multinomial likelihood : $ P(x_1, \ldots, x_n \mid y = C_k) \propto \prod_i \theta_{ki}^{x_i} $
In [24]:
# smoothed estimate (N_ki + alpha) / (N_k + alpha * n), computed by hand
(fc + clf_mult.alpha) / (np.repeat(fc.sum(axis=1)[:, np.newaxis], 6, axis=1) + clf_mult.alpha * X.shape[1])
Out[24]:
In [26]:
# the model's smoothed estimates, recovered from the stored log probabilities
theta = np.exp(clf_mult.feature_log_prob_)
theta
Out[26]:
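As a sanity check, the hand-computed smoothed estimates from In [24] should match theta exactly (a small verification sketch reusing the variables above):

manual = (fc + clf_mult.alpha) / (fc.sum(axis=1, keepdims=True) + clf_mult.alpha * X.shape[1])
np.allclose(manual, theta)  # True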
In [29]:
# predict the class for the observed face counts
# (number_1: 5, number_2: 2, number_3: 11, number_4: 3, number_5: 1, number_6: 6)
x_new = np.array([5, 2, 11, 3, 1, 6])
clf_mult.predict_proba([x_new])
# the predicted probability of the third die is about 0.9843
Out[29]:
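predict_proba can also be reproduced by hand from the fitted parameters: the joint log-likelihood is the class log prior plus $x \cdot \log \theta_k$, normalized with a softmax. A minimal sketch, using the model's class_log_prior_ and feature_log_prob_ attributes:

jll = clf_mult.class_log_prior_ + x_new @ clf_mult.feature_log_prob_.T
np.exp(jll - jll.max()) / np.exp(jll - jll.max()).sum()  # same values as predict_proba([x_new])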
In [38]:
x_new = np.array([18, 24, 35, 24, 51, 13])
clf_mult.predict_proba([x_new])
# the predicted probability of the second die is about 0.999
Out[38]: