Import Packages



In [2]:

    
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

import scipy.stats
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB

1. Make Sample Data



In [3]:

    
# scipy.stats.norm(loc, scale): loc is the mean, scale is the standard deviation.
# X0: mean -2, std 1, 40 samples -> class 0
# X1: mean  2, std 1, 60 samples -> class 1
# One seeded RandomState feeds both draws so that Restart & Run All reproduces
# the same data (and therefore the same fitted parameters) on every run.
rng = np.random.RandomState(0)
X0 = scipy.stats.norm(-2, 1).rvs(40, random_state=rng)
X1 = scipy.stats.norm(2, 1).rvs(60, random_state=rng)

# Concatenate the two samples to shape (100,), then add a trailing axis to get
# (100, 1): the classifier is fitted on a 2-D (n_samples, n_features) array.
X = np.hstack([X0, X1])[:, np.newaxis]

# Labels: 0 for every X0 sample, 1 for every X1 sample. Sizes are taken from
# the samples themselves so they cannot drift out of sync with the counts above.
y0 = np.zeros(len(X0))
y1 = np.ones(len(X1))
y = np.hstack([y0, y1])

X.shape, y.shape









    Out[3]:





((100, 1), (100,))

2. Draw Histogram



In [4]:

    
# Overlay density-normalized histograms (with rug marks, no KDE curve) of the
# two class samples on the same axes, then label them in a legend.
for samples, tag in ((X0, "class 0"), (X1, "class 1")):
    sns.distplot(samples, rug=True, kde=False, norm_hist=True, label=tag)
plt.legend()









    Out[4]:





<matplotlib.legend.Legend at 0x109615b00>

3. Make Gaussian Naive Bayes Model



In [5]:

    
# Build a Gaussian Naive Bayes estimator and fit it on the samples X with
# labels y; the value returned by fit() is kept as the trained classifier.
gnb = GaussianNB()
clf_norm = gnb.fit(X, y)



In [6]:

    
# class labels seen during fit (the distinct values of y), not a count
clf_norm.classes_









    Out[6]:





array([ 0.,  1.])



In [7]:

    
# number of training samples in each class (X0 count, X1 count)
clf_norm.class_count_









    Out[7]:





array([ 40.,  60.])



In [8]:

    
# prior probability of each class: each class's share of the training samples
clf_norm.class_prior_









    Out[8]:





array([ 0.4,  0.6])



In [9]:

    
# per-class mean of the feature, estimated from the X0 and X1 samples respectively
clf_norm.theta_









    Out[9]:





array([[-1.75176994],
       [ 2.08589069]])



In [10]:

    
# Per-class variance of the feature (the variance, not the standard deviation).
# scikit-learn 1.0 renamed `sigma_` to `var_` and 1.2 removed `sigma_`, so read
# `var_` when it exists and fall back to `sigma_` only on older releases.
clf_norm.var_ if hasattr(clf_norm, "var_") else clf_norm.sigma_









    Out[10]:





array([[ 0.66957309],
       [ 0.68820588]])

4. Predict



In [76]:

    
# Posterior class probabilities for a single new observation, x = -1.
# The observation is wrapped as one row with one feature (a nested list).
x_new = -1
query = [[x_new]]
clf_norm.predict_proba(query)

# In the recorded run, class 0 receives a posterior probability of about 0.976.









    Out[76]:





array([[ 0.97633995,  0.02366005]])



In [81]:

    
# Class-conditional likelihoods p(x_new | class): the density of a normal
# distribution with each class's fitted mean and standard deviation, at x_new.
# The module is referenced as `scipy.stats` because that is the name the import
# cell binds; `sp` is never defined and raised NameError on a fresh kernel.
# The fitted variance is `var_` on scikit-learn >= 1.0 and `sigma_` before that
# (`sigma_` was removed in 1.2), so take whichever attribute exists.
variance = clf_norm.var_ if hasattr(clf_norm, "var_") else clf_norm.sigma_
px = scipy.stats.norm(clf_norm.theta_, np.sqrt(variance)).pdf(x_new)
px









    Out[81]:





array([[ 0.30085478],
       [ 0.00486049]])



In [78]:

    
# Unnormalized posterior: likelihood p(x_new | class) times the class prior.
# px is a column of per-class likelihoods; collapse it to 1-D so it multiplies
# element-wise with class_prior_.
p = np.ravel(px) * clf_norm.class_prior_
p









    Out[78]:





array([ 0.12034191,  0.0029163 ])



In [79]:

    
# Normalize so the values sum to 1, giving the posterior class probabilities;
# in the recorded run this matches the predict_proba output for x_new.
p / np.sum(p)









    Out[79]:





array([ 0.97633995,  0.02366005])