In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import sklearn as sk
import matplotlib as mpl
import matplotlib.pylab as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
sns.set()
sns.set_color_codes()

%matplotlib inline
%config InlineBackend.figure_format='png'

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
    'The last document?',    
]
vect = CountVectorizer()
vect.fit(corpus)
vect.vocabulary_


Out[2]:
{'and': 0,
 'document': 1,
 'first': 2,
 'is': 3,
 'last': 4,
 'one': 5,
 'second': 6,
 'the': 7,
 'third': 8,
 'this': 9}

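The fitted vocabulary_ maps each token to a column index of the document-term matrix. As a quick check, not part of the original run, a new sentence can be encoded with transform; tokens outside the vocabulary are simply ignored:

# encode an unseen sentence with the fitted vocabulary (unknown tokens are dropped)
vect.transform(['This is the second document.']).toarray()
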
In [3]:
vect = CountVectorizer(max_df=4, min_df=2).fit(corpus)

In [4]:
vect.vocabulary_, vect.stop_words_


Out[4]:
({'document': 0, 'first': 1, 'is': 2, 'this': 3},
 {'and', 'last', 'one', 'second', 'the', 'third'})

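With min_df=2 and max_df=4, only tokens occurring in at least 2 and at most 4 of the 5 documents keep a column; 'the' (present in all 5) and the words seen only once are moved to stop_words_. A small sketch, not in the original notebook, that reproduces the document frequencies behind this filtering:

# document frequency: in how many of the 5 documents does each token appear?
vect_all = CountVectorizer().fit(corpus)
dtm = vect_all.transform(corpus).toarray()
df = (dtm > 0).sum(axis=0)
dict(zip(sorted(vect_all.vocabulary_, key=vect_all.vocabulary_.get), df))
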
In [5]:
vect = CountVectorizer().fit(corpus)

In [8]:
vect.vocabulary_


Out[8]:
{'and': 0,
 'document': 1,
 'first': 2,
 'is': 3,
 'last': 4,
 'one': 5,
 'second': 6,
 'the': 7,
 'third': 8,
 'this': 9}

In [7]:
vect.transform(corpus).toarray().sum(axis=0)


Out[7]:
array([1, 4, 2, 3, 1, 1, 2, 5, 1, 3])

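To see which total belongs to which token, the column sums can be paired with the vocabulary; a convenience snippet that was not in the original run:

counts = vect.transform(corpus).toarray().sum(axis=0)
tokens = sorted(vect.vocabulary_, key=vect.vocabulary_.get)   # tokens in column order
dict(zip(tokens, counts))
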
In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
tfidv = TfidfVectorizer().fit(corpus)
tfidv.transform(corpus).toarray()


Out[10]:
array([[ 0.        ,  0.38947624,  0.55775063,  0.4629834 ,  0.        ,
         0.        ,  0.        ,  0.32941651,  0.        ,  0.4629834 ],
       [ 0.        ,  0.24151532,  0.        ,  0.28709733,  0.        ,
         0.        ,  0.85737594,  0.20427211,  0.        ,  0.28709733],
       [ 0.55666851,  0.        ,  0.        ,  0.        ,  0.        ,
         0.55666851,  0.        ,  0.26525553,  0.55666851,  0.        ],
       [ 0.        ,  0.38947624,  0.55775063,  0.4629834 ,  0.        ,
         0.        ,  0.        ,  0.32941651,  0.        ,  0.4629834 ],
       [ 0.        ,  0.45333103,  0.        ,  0.        ,  0.80465933,
         0.        ,  0.        ,  0.38342448,  0.        ,  0.        ]])

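Under scikit-learn's default settings (smooth_idf=True, norm='l2'), each row above is the term-count vector reweighted by idf(t) = ln((1 + n) / (1 + df(t))) + 1 and then L2-normalized. A by-hand sketch of that computation, assuming those defaults, for comparison with Out[10]:

tf = CountVectorizer().fit_transform(corpus).toarray().astype(float)
df = (tf > 0).sum(axis=0)                        # document frequency of each token
n = tf.shape[0]                                  # number of documents
idf = np.log((1 + n) / (1 + df)) + 1             # smoothed idf
tfidf = tf * idf
tfidf / np.linalg.norm(tfidf, axis=1, keepdims=True)   # L2-normalize each row
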
In [193]:
X00 = sp.stats.norm(-2, 1).rvs(50)
X01 = sp.stats.norm(+2, 1).rvs(50)

X10 = sp.stats.norm(+3, 2).rvs(50)
X11 = sp.stats.norm(+7, 3).rvs(50)

X20 = sp.stats.norm(+5, 1).rvs(50)
X21 = sp.stats.norm(+3, 2).rvs(50)

X0 = np.vstack([X00,X01]).T
X1 = np.vstack([X10,X11]).T
X2 = np.vstack([X20,X21]).T

X = np.vstack([X0,X1,X2])
X.shape

y0 = np.zeros(50)
y1 = np.ones(50)
y2 = np.ones(50) * 2
y = np.hstack([y0,y1,y2])[:,np.newaxis]
y.shape

Xy = np.hstack([X,y])

In [197]:
X_df = pd.DataFrame(Xy, columns=['X0','X1','y'])

In [195]:
sns.pairplot(X_df, hue="y")


Out[195]:
<seaborn.axisgrid.PairGrid at 0x7f92fa901828>

In [201]:
cmap = mpl.colors.ListedColormap(sns.color_palette("Set3"))
plt.scatter(X[:,0], X[:,1], c=y, s=50, cmap=cmap)


Out[201]:
<matplotlib.collections.PathCollection at 0x7f92fb2c8d30>

In [206]:
from sklearn.naive_bayes import GaussianNB
clf_norm = GaussianNB().fit(X,y)


/home/rrbb/.pyenv/versions/3.5.1/envs/python3/lib/python3.5/site-packages/sklearn/utils/validation.py:515: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)

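The warning only complains that y was passed as a column vector; handing GaussianNB a 1-D label array gives the same model without the warning. An equivalent call, not re-run here:

clf_norm = GaussianNB().fit(X, y.ravel())   # same fit, no DataConversionWarning
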
In [207]:
clf_norm.classes_


Out[207]:
array([ 0.,  1.,  2.])

In [208]:
clf_norm.class_count_


Out[208]:
array([ 50.,  50.,  50.])

In [209]:
clf_norm.class_prior_


Out[209]:
array([ 0.33333333,  0.33333333,  0.33333333])

In [210]:
clf_norm.theta_, clf_norm.sigma_


Out[210]:
(array([[-1.94633003,  2.04934779],
        [ 3.037226  ,  6.91927404],
        [ 4.76635327,  3.59396459]]), array([[ 0.85189353,  0.8359859 ],
        [ 2.83895493,  8.17459234],
        [ 1.24649974,  5.42785848]]))

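theta_ and sigma_ hold the per-class feature means and variances estimated by GaussianNB. A quick cross-check against numpy, sketched but not executed in the original notebook:

labels = y.ravel()
np.vstack([X[labels == k].mean(axis=0) for k in (0, 1, 2)])   # should match clf_norm.theta_
np.vstack([X[labels == k].var(axis=0) for k in (0, 1, 2)])    # should match clf_norm.sigma_
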
In [212]:
y_predict = clf_norm.predict(X)

In [213]:
from sklearn.metrics import confusion_matrix
clf_result = confusion_matrix(y, y_predict)

In [214]:
clf_result


Out[214]:
array([[50,  0,  0],
       [ 0, 40, 10],
       [ 0,  8, 42]])

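The diagonal counts the correctly classified samples, so the overall accuracy follows directly from the confusion matrix:

clf_result.trace() / clf_result.sum()   # (50 + 40 + 42) / 150 = 0.88
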
In [225]:
from sklearn.metrics import classification_report
classification_report?   # view the docstring via IPython's help

In [226]:
print(classification_report(y,y_predict))


             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00        50
        1.0       0.83      0.80      0.82        50
        2.0       0.81      0.84      0.82        50

avg / total       0.88      0.88      0.88       150


In [228]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

qda = QuadraticDiscriminantAnalysis(store_covariances=True).fit(X, y)


/home/rrbb/.pyenv/versions/3.5.1/envs/python3/lib/python3.5/site-packages/sklearn/utils/validation.py:515: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)

In [234]:
xmin, xmax = -10, 15
ymin, ymax = -10, 15
XX, YY = np.meshgrid(np.arange(xmin, xmax, (xmax-xmin)/1000), np.arange(ymin, ymax, (ymax-ymin)/1000))
ZZ = np.reshape(qda.predict(np.array([XX.ravel(), YY.ravel()]).T), XX.shape)
cmap = mpl.colors.ListedColormap(sns.color_palette("Set3"))
plt.contourf(XX, YY, ZZ, cmap=cmap, alpha=0.5)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap=cmap)
plt.xlim(xmin, xmax)
plt.ylim(ymin, ymax)
plt.show()



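The cells below switch to a one-dimensional, two-class example whose setup cell is not included in this excerpt. Judging from the outputs that follow (40 samples centered near -2, 60 samples centered near +2), the missing cell looked roughly like this reconstruction:

X0 = sp.stats.norm(-2, 1).rvs(40)              # class 0: 40 samples around -2
X1 = sp.stats.norm(+2, 1).rvs(60)              # class 1: 60 samples around +2
X = np.hstack([X0, X1])[:, np.newaxis]
y = np.hstack([np.zeros(40), np.ones(60)])
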
In [41]:
X0.mean(), X0.var(), X1.mean(), X1.var()


Out[41]:
(-1.8619161572303782,
 0.8586128820694926,
 1.8830285850671031,
 1.3066090770986736)

In [31]:
from sklearn.naive_bayes import GaussianNB
clf_norm = GaussianNB().fit(X, y)

In [32]:
clf_norm.classes_    # the class labels being predicted


Out[32]:
array([ 0.,  1.])

In [33]:
clf_norm.class_count_  # how many samples belong to class 0 and to class 1


Out[33]:
array([ 40.,  60.])

In [36]:
clf_norm.class_prior_  # prior probabilities P(y=0), P(y=1)


Out[36]:
array([ 0.4,  0.6])

In [35]:
clf_norm.theta_, clf_norm.sigma_  # per-class mean and variance


Out[35]:
(array([[-1.86191616],
        [ 1.88302859]]), array([[ 0.85861289],
        [ 1.30660908]]))

In [43]:
# Gaussian curves estimated by the model, not kernel density estimates;
# sigma_ holds variances, so pass its square root as the scale parameter
xx = np.linspace(-6, 6, 100)
p0 = sp.stats.norm(clf_norm.theta_[0], np.sqrt(clf_norm.sigma_[0])).pdf(xx)
p1 = sp.stats.norm(clf_norm.theta_[1], np.sqrt(clf_norm.sigma_[1])).pdf(xx)
sns.distplot(X0, rug=True, kde=False, norm_hist=True, color="r", label="class 0 histogram")
sns.distplot(X1, rug=True, kde=False, norm_hist=True, color="b", label="class 1 histogram")
plt.plot(xx, p0, c="r", label="class 0 est. pdf")
plt.plot(xx, p1, c="b", label="class 1 est. pdf")
plt.legend()
plt.show()



In [46]:
x_new = -1
clf_norm.predict_proba([[x_new]])


Out[46]:
array([[ 0.92774474,  0.07225526]])

In [50]:
px = sp.stats.norm(clf_norm.theta_, np.sqrt(clf_norm.sigma_)).pdf(x_new)
px


Out[50]:
array([[ 0.27933708],
       [ 0.01450368]])

In [52]:
p = px.flatten() * clf_norm.class_prior_
p


Out[52]:
array([ 0.11173483,  0.00870221])

In [53]:
p / p.sum()


Out[53]:
array([ 0.92774474,  0.07225526])

Bernoulli Naive Bayes


In [236]:
np.random.seed(0)
X = np.random.randint(2, size=(10, 4))
y = np.array([0,0,0,0,1,1,1,1,1,1])
print(X)
print(y)


[[0 1 1 0]
 [1 1 1 1]
 [1 1 1 0]
 [0 1 0 0]
 [0 0 0 1]
 [0 1 1 0]
 [0 1 1 1]
 [1 0 1 0]
 [1 0 1 1]
 [0 1 1 0]]
[0 0 0 0 1 1 1 1 1 1]

In [237]:
from sklearn.naive_bayes import BernoulliNB
clf_bern = BernoulliNB().fit(X, y)

In [238]:
clf_bern.classes_


Out[238]:
array([0, 1])

In [239]:
clf_bern.class_count_


Out[239]:
array([ 4.,  6.])

In [243]:
np.exp(clf_bern.class_log_prior_)


Out[243]:
array([ 0.4,  0.6])

In [245]:
fc = clf_bern.feature_count_
fc


Out[245]:
array([[ 2.,  4.,  3.,  1.],
       [ 2.,  3.,  5.,  3.]])

In [246]:
theta = np.exp(clf_bern.feature_log_prob_)
theta


Out[246]:
array([[ 0.5       ,  0.83333333,  0.66666667,  0.33333333],
       [ 0.375     ,  0.5       ,  0.75      ,  0.5       ]])

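feature_log_prob_ stores the smoothed Bernoulli parameters: with the default alpha=1, each entry is (feature_count + alpha) / (class_count + 2 * alpha). A by-hand check, not part of the original run:

(fc + clf_bern.alpha) / (clf_bern.class_count_[:, np.newaxis] + 2 * clf_bern.alpha)
# e.g. feature 1 in class 0: (4 + 1) / (4 + 2) = 0.8333, matching theta above
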
In [247]:
from sklearn.naive_bayes import MultinomialNB
clf_mult = MultinomialNB().fit(X,y)

In [248]:
clf_mult.classes_


Out[248]:
array([0, 1])

In [249]:
clf_mult.class_count_


Out[249]:
array([ 4.,  6.])

In [250]:
fc = clf_mult.feature_count_
fc


Out[250]:
array([[ 2.,  4.,  3.,  1.],
       [ 2.,  3.,  5.,  3.]])

In [251]:
fc / np.repeat(fc.sum(axis=1)[:,np.newaxis], 4, axis=1)


Out[251]:
array([[ 0.2       ,  0.4       ,  0.3       ,  0.1       ],
       [ 0.15384615,  0.23076923,  0.38461538,  0.23076923]])

In [252]:
clf_mult.alpha


Out[252]:
1.0

In [254]:
# you should be able to reproduce this by hand: Laplace smoothing with alpha = 1

(fc + clf_mult.alpha) / (np.repeat(fc.sum(axis=1)[:, np.newaxis], 4, axis=1) + clf_mult.alpha * X.shape[1])


Out[254]:
array([[ 0.21428571,  0.35714286,  0.28571429,  0.14285714],
       [ 0.17647059,  0.23529412,  0.35294118,  0.23529412]])

In [255]:
theta = np.exp(clf_mult.feature_log_prob_)
theta


Out[255]:
array([[ 0.21428571,  0.35714286,  0.28571429,  0.14285714],
       [ 0.17647059,  0.23529412,  0.35294118,  0.23529412]])

In [256]:
x_new = np.array([21, 35, 29, 14])
clf_mult.predict_proba([x_new])


Out[256]:
array([[ 0.99431066,  0.00568934]])

In [257]:
p = (theta**x_new).prod(axis=1) * np.exp(clf_mult.class_log_prior_)  # use the multinomial model's own class prior
p / p.sum()


Out[257]:
array([ 0.99431066,  0.00568934])

In [258]:
x_new = np.array([18, 24, 35, 24])
clf_mult.predict_proba([x_new])


Out[258]:
array([[ 0.00189418,  0.99810582]])

In [259]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.cross_validation import train_test_split
news = fetch_20newsgroups(subset="all")
X_train, X_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.1, random_state=1)

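A note on the imports: sklearn.cross_validation was the module name when this notebook was written; it was removed in scikit-learn 0.20, so on a current install the equivalent imports would presumably be:

from sklearn.model_selection import train_test_split, cross_val_score, KFold
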
In [262]:
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

clf_0 = Pipeline([
        ('vect', CountVectorizer()),
        ('clf', LogisticRegression()),
    ])
clf_1 = Pipeline([
            ('vect', CountVectorizer()), 
            ('clf', MultinomialNB()),
        ])
clf_2 = Pipeline([
            ('vect', TfidfVectorizer()),
            ('clf', MultinomialNB()),
        ])
clf_3 = Pipeline([
            ('vect', TfidfVectorizer(token_pattern=r"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b")),
            ('clf', MultinomialNB()),
        ])
clf_4 = Pipeline([
            ('vect', TfidfVectorizer(stop_words="english", 
                                     token_pattern=r"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b")),
            ('clf', MultinomialNB()),
        ])
clf_5 = Pipeline([
            ('vect', TfidfVectorizer(stop_words="english", 
                                     token_pattern=r"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b")),
            ('clf', MultinomialNB(alpha=0.01)),
        ])

In [263]:
from sklearn.cross_validation import cross_val_score, KFold
from scipy.stats import sem

for i, clf in enumerate([clf_0, clf_1, clf_2, clf_3, clf_4, clf_5]):
    scores = cross_val_score(clf, X_test, y_test, cv=5)
    print(("Model {0:d}: Mean score: {1: .3f} (+/-{2: .3f})").format(i, np.mean(scores), sem(scores)))


Model 0: Mean score:  0.718 (+/- 0.006)
Model 1: Mean score:  0.607 (+/- 0.005)
Model 2: Mean score:  0.548 (+/- 0.013)
Model 3: Mean score:  0.614 (+/- 0.008)
Model 4: Mean score:  0.741 (+/- 0.004)
Model 5: Mean score:  0.808 (+/- 0.008)

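The scores above are 5-fold cross-validation means computed on the held-out 10% split, and the TF-IDF pipeline with a custom token pattern, English stop words, and alpha=0.01 (Model 5) comes out ahead. A natural follow-up, sketched here but not executed, is to fit that pipeline on the training portion and score it on the test portion, mirroring the evaluation of clf_0 below:

clf5_fit = clf_5.fit(X_train, y_train)
print(classification_report(y_test, clf5_fit.predict(X_test)))
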
In [270]:
clf0_fit = clf_0.fit(X_train, y_train)

In [272]:
clf0_predict = clf0_fit.predict(X_test)

In [278]:
confusion_matrix(y_test, clf0_predict)
print(classification_report(y_test, clf0_predict))


             precision    recall  f1-score   support

          0       0.96      0.91      0.93        78
          1       0.84      0.86      0.85       101
          2       0.90      0.92      0.91        96
          3       0.85      0.89      0.87       110
          4       0.88      0.89      0.89       101
          5       0.95      0.86      0.90       101
          6       0.91      0.94      0.92        93
          7       0.92      0.90      0.91       109
          8       0.96      0.98      0.97       103
          9       0.97      0.94      0.95       101
         10       0.99      0.99      0.99       110
         11       0.94      0.96      0.95       104
         12       0.79      0.87      0.83        79
         13       0.93      0.96      0.95        90
         14       0.99      0.96      0.97       101
         15       0.94      0.97      0.96       110
         16       0.95      0.93      0.94        98
         17       0.99      0.99      0.99        77
         18       0.95      0.91      0.93        78
         19       0.85      0.76      0.80        45

avg / total       0.93      0.92      0.92      1885

