In [1]:

    
# Import Packages



In [50]:

    
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

import scipy.stats
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

1. Make Sample Data



In [73]:

    
# Two subjects (the two feature columns) and three students (classes 0, 1, 2).
# Goal: predict which student a pair of subject scores belongs to.

# Seeded generator so that a fresh "Restart & Run All" reproduces exactly the
# same samples (and therefore the same fitted parameters and scores).
SEED = 0
rng = np.random.RandomState(SEED)

# student 0: 100 samples, scores ~ N(80, 1) and N(70, 1)
X00 = scipy.stats.norm(80, 1).rvs(100, random_state=rng)
X01 = scipy.stats.norm(70, 1).rvs(100, random_state=rng)
X0 = np.vstack([X00, X01]).T

# student 1: 200 samples, scores ~ N(70, 5) and N(90, 5)
X10 = scipy.stats.norm(70, 5).rvs(200, random_state=rng)
X11 = scipy.stats.norm(90, 5).rvs(200, random_state=rng)
X1 = np.vstack([X10, X11]).T

# student 2: 300 samples, scores ~ N(70, 1) and N(80, 5)
X20 = scipy.stats.norm(70, 1).rvs(300, random_state=rng)
X21 = scipy.stats.norm(80, 5).rvs(300, random_state=rng)
X2 = np.vstack([X20, X21]).T

# stack the per-student samples vertically -> (600, 2) feature matrix
X = np.vstack([X0, X1, X2])

# class labels, one per sample, as a (600, 1) column vector
y0 = np.zeros(100)
y1 = np.ones(200)
y2 = np.ones(300)*2
y = np.hstack([y0, y1, y2])[:, np.newaxis]

# features and label side by side, ready to become a pandas DataFrame
m = np.hstack([X, y])

X.shape, y.shape, m.shape









    Out[73]:





((600, 2), (600, 1), (600, 3))



In [74]:

    
# Wrap the merged array in a DataFrame with named columns, then preview its last rows.
df = pd.DataFrame(
    data=m,
    columns=["subject_0", "subject_1", "student"],
)
df.tail()

2. Draw Pairplot



In [75]:

    
# Pairwise plots of the DataFrame's columns, coloured by the "student" label
# (a hue variable with more than two levels).
sns.pairplot(data=df, hue="student")









    Out[75]:





<seaborn.axisgrid.PairGrid at 0x109a9e6a0>



In [76]:

    
# Scatter the two subject scores against each other, one colour per student label.
cmap = mpl.colors.ListedColormap(sns.color_palette("Set3"))
plt.scatter(x=df["subject_0"], y=df["subject_1"], c=y, s=50, cmap=cmap)









    Out[76]:





<matplotlib.collections.PathCollection at 0x1171797b8>

3. Make Gaussian Naive Bayes Model



In [77]:

    
# Rebuild the labels as a flat (600,) vector instead of the (600, 1) column
# vector, which makes scikit-learn emit a DataConversionWarning on fit.
y = np.concatenate([y0, y1, y2])
clf_norm = GaussianNB()
clf_norm = clf_norm.fit(X, y)



In [78]:

    
# Class labels known to the fitted model (one entry per student).
clf_norm.classes_









    Out[78]:





array([ 0.,  1.,  2.])



In [79]:

    
# Number of training samples seen for each class.
clf_norm.class_count_









    Out[79]:





array([ 100.,  200.,  300.])



In [80]:

    
# Prior probability of each class (here: class count / total number of samples).
clf_norm.class_prior_









    Out[80]:





array([ 0.16666667,  0.33333333,  0.5       ])



In [81]:

    
# Estimated mean of each feature per class (rows: classes, columns: subjects).
clf_norm.theta_









    Out[81]:





array([[ 79.91488301,  70.01215615],
       [ 69.24308549,  89.51594467],
       [ 70.07015894,  80.09972626]])



In [87]:

    
# Per-class variance of each feature, i.e. the square of the standard deviation
# of the sample data (rows: classes, columns: subjects).
# scikit-learn renamed `sigma_` to `var_` in 1.0 and removed `sigma_` in 1.2,
# so prefer `var_` and fall back to `sigma_` only on older versions.
clf_norm.var_ if hasattr(clf_norm, "var_") else clf_norm.sigma_









    Out[87]:





array([[  0.91787546,   1.24542101],
       [ 22.84219207,  22.3139726 ],
       [  1.06906486,  28.26483329]])

4. Predict



In [88]:

    
# Predict a class for every sample; note X is the same data the model was fitted on.
y_pred = clf_norm.predict(X)



In [89]:

    
# Confusion matrix: rows are the true classes, columns the predicted classes.
confusion_matrix(y, y_pred)









    Out[89]:





array([[100,   0,   0],
       [  0, 174,  26],
       [  0,  17, 283]])



In [90]:

    
# Per-class precision / recall / F1 of the predictions.
print(classification_report(y, y_pred))

# The scores look good, but they are measured on the same data the model was fitted on.









    



             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00       100
        1.0       0.91      0.87      0.89       200
        2.0       0.92      0.94      0.93       300

avg / total       0.93      0.93      0.93       600



In [91]:

    
# Plot the classifier's decision regions with the training samples on top.
xmin, xmax = 50, 90
ymin, ymax = 60, 110

# np.linspace yields exactly 1000 points per axis and includes the upper bound;
# np.arange with a fractional step can be off by one element and never
# evaluates the last strip of the plotted range.
XX, YY = np.meshgrid(np.linspace(xmin, xmax, 1000), np.linspace(ymin, ymax, 1000))

# Classify every grid point, then fold the predictions back into the grid shape.
ZZ = clf_norm.predict(np.column_stack([XX.ravel(), YY.ravel()])).reshape(XX.shape)

cmap = mpl.colors.ListedColormap(sns.color_palette("Set3"))
plt.contourf(XX, YY, ZZ, cmap=cmap, alpha=0.5)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap=cmap)
plt.xlabel("subject_0")
plt.ylabel("subject_1")
plt.xlim(xmin, xmax)
plt.ylim(ymin, ymax)









    Out[91]:





(60, 110)



In [ ]:

	subject_0	subject_1	student
595	66.821704	75.952786	2.0
596	70.046716	77.303721	2.0
597	69.100753	84.241568	2.0
598	69.208940	79.732807	2.0
599	70.587497	76.716414	2.0