In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import sklearn as sk
import matplotlib as mpl
import matplotlib.pylab as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
sns.set()
sns.set_color_codes()
%matplotlib inline
%config InlineBackend.figure_format='png'
In [2]:
# Demonstrate CountVectorizer: learn a bag-of-words vocabulary from a tiny corpus.
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
'This is the first document.',
'This is the second second document.',
'And the third one.',
'Is this the first document?',
'The last document?',
]

count_vect = CountVectorizer()
count_vect.fit(corpus)
# Mapping from token to feature (column) index.
count_vect.vocabulary_
Out[2]:
In [3]:
# max_df / min_df prune tokens by document frequency; pruned tokens end up in stop_words_.
vect = CountVectorizer(max_df=4, min_df=2).fit(corpus)
In [4]:
# Surviving vocabulary vs. tokens removed by the max_df/min_df thresholds.
vect.vocabulary_, vect.stop_words_
Out[4]:
In [5]:
# Refit with default settings (no document-frequency pruning).
vect = CountVectorizer().fit(corpus)
In [8]:
# Full token -> column-index vocabulary.
vect.vocabulary_
Out[8]:
In [7]:
# Total count of each vocabulary token across the whole corpus.
vect.transform(corpus).toarray().sum(axis=0)
Out[7]:
In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
In [10]:
# TF-IDF weighting of the same corpus: rare tokens get larger weights.
tfidf = TfidfVectorizer()
tfidf.fit(corpus)
tfidf.transform(corpus).toarray()
Out[10]:
In [ ]:
In [193]:
# Synthetic 2-D dataset: three classes of 50 points each, every feature
# drawn from its own normal distribution (same draw order as before).
N_PER_CLASS = 50
CLASS_SPECS = [((-2, 1), (+2, 1)),
               ((+3, 2), (+7, 3)),
               ((+5, 1), (+3, 2))]

class_blocks = []
for (loc_a, scale_a), (loc_b, scale_b) in CLASS_SPECS:
    feat_a = sp.stats.norm(loc_a, scale_a).rvs(N_PER_CLASS)
    feat_b = sp.stats.norm(loc_b, scale_b).rvs(N_PER_CLASS)
    class_blocks.append(np.vstack([feat_a, feat_b]).T)

X0, X1, X2 = class_blocks
X = np.vstack(class_blocks)
X.shape
# Labels 0/1/2 as a (150, 1) column so they can be hstacked with X.
y = np.hstack([np.zeros(N_PER_CLASS),
               np.ones(N_PER_CLASS),
               np.ones(N_PER_CLASS) * 2])[:, np.newaxis]
y.shape
Xy = np.hstack([X, y])
In [197]:
# Wrap the stacked data in a DataFrame so seaborn can color by label.
X_df = pd.DataFrame(Xy, columns=['X0','X1','y'])
In [195]:
# Pairwise scatter/histogram matrix, colored by class label.
sns.pairplot(X_df, hue="y")
Out[195]:
In [201]:
# Scatter the raw data, colored by class.
cmap = mpl.colors.ListedColormap(sns.color_palette("Set3"))
# y is a (150, 1) column vector; matplotlib expects a flat array of color values.
plt.scatter(X[:, 0], X[:, 1], c=y.ravel(), s=50, cmap=cmap)
Out[201]:
In [206]:
from sklearn.naive_bayes import GaussianNB
# y is a (150, 1) column; ravel it to the 1-d shape sklearn expects
# (avoids a DataConversionWarning).
clf_norm = GaussianNB().fit(X, y.ravel())
In [207]:
# Class labels seen during fit.
clf_norm.classes_
Out[207]:
In [208]:
# Number of training samples per class (50 each).
clf_norm.class_count_
Out[208]:
In [209]:
# Estimated prior probability P(y=k) for each class.
clf_norm.class_prior_
Out[209]:
In [210]:
# Per-class feature means and variances. Note sigma_ holds *variances*,
# not standard deviations (the attribute was renamed var_ in sklearn 1.0).
clf_norm.theta_, clf_norm.sigma_
Out[210]:
In [212]:
# Predict on the training data itself (in-sample evaluation).
y_predict = clf_norm.predict(X)
In [213]:
from sklearn.metrics import confusion_matrix
# Rows: true class, columns: predicted class.
clf_result = confusion_matrix(y, y_predict)
In [214]:
# In-sample confusion matrix.
clf_result
Out[214]:
In [225]:
from sklearn.metrics import classification_report
# The original line `print(classification_report?` was a SyntaxError:
# `?` is IPython help syntax and cannot be wrapped in print().
# Use help() so the cell is valid Python and survives a re-run.
help(classification_report)
In [226]:
# Precision / recall / F1 per class on the training data.
print(classification_report(y,y_predict))
In [228]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# `store_covariances` was renamed `store_covariance` (sklearn 0.19; the old
# spelling has been removed). Ravel y to the 1-d shape sklearn expects.
qda = QuadraticDiscriminantAnalysis(store_covariance=True).fit(X, y.ravel())
In [234]:
# Visualize the QDA decision regions on a dense grid over the data range.
x_lo, x_hi = -10, 15
y_lo, y_hi = -10, 15
grid_x, grid_y = np.meshgrid(np.arange(x_lo, x_hi, (x_hi - x_lo) / 1000),
                             np.arange(y_lo, y_hi, (y_hi - y_lo) / 1000))
grid_points = np.array([grid_x.ravel(), grid_y.ravel()]).T
grid_pred = qda.predict(grid_points).reshape(grid_x.shape)
cmap = mpl.colors.ListedColormap(sns.color_palette("Set3"))
plt.contourf(grid_x, grid_y, grid_pred, cmap=cmap, alpha=0.5)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap=cmap)
plt.xlim(x_lo, x_hi)
plt.ylim(y_lo, y_hi)
plt.show()
In [41]:
# NOTE(review): X0/X1 are (50, 2) arrays at this point, so these statistics
# pool both features -- this cell looks like a leftover from a univariate run.
X0.mean(), X0.var(), X1.mean(), X1.var()
Out[41]:
In [31]:
from sklearn.naive_bayes import GaussianNB
# Duplicate of the earlier GaussianNB fit; ravel the (150, 1) column target
# to avoid the DataConversionWarning.
clf_norm = GaussianNB().fit(X, y.ravel())
In [32]:
clf_norm.classes_ # the classes being classified
Out[32]:
In [33]:
clf_norm.class_count_ # how many times each class (0, 1) appears
Out[33]:
In [36]:
clf_norm.class_prior_ # prior probabilities P(y=0), P(y=1)
Out[36]:
In [35]:
clf_norm.theta_, clf_norm.sigma_ # mean and variance
Out[35]:
In [43]:
# kernel density가 아닌 gaussian curve
xx = np.linspace(-6, 6, 100)
p0 = sp.stats.norm(clf_norm.theta_[0], clf_norm.sigma_[0]).pdf(xx)
p1 = sp.stats.norm(clf_norm.theta_[1], clf_norm.sigma_[1]).pdf(xx)
sns.distplot(X0, rug=True, kde=False, norm_hist=True, color="r", label="class 0 histogram")
sns.distplot(X1, rug=True, kde=False, norm_hist=True, color="b", label="class 1 histogram")
plt.plot(xx, p0, c="r", label="class 0 est. pdf")
plt.plot(xx, p1, c="b", label="class 1 est. pdf")
plt.legend()
plt.show()
In [46]:
# Posterior probabilities for a single new point.
# NOTE(review): [[x_new]] has one feature -- this only works if the model was
# fit on univariate data, not the 2-feature X defined above; confirm.
x_new = -1
clf_norm.predict_proba([[x_new]])
Out[46]:
In [50]:
# Manual likelihoods P(x_new | y=k); sqrt(sigma_) converts the stored
# variance to the standard deviation that norm() expects.
px = sp.stats.norm(clf_norm.theta_, np.sqrt(clf_norm.sigma_)).pdf(x_new)
px
Out[50]:
In [52]:
# Unnormalized posteriors: likelihood times class prior.
p = px.flatten() * clf_norm.class_prior_
p
Out[52]:
In [53]:
# Normalize -- should reproduce predict_proba above.
p / p.sum()
Out[53]:
In [236]:
# Small seeded binary dataset for the Bernoulli / multinomial NB examples:
# 10 samples x 4 binary features, first 4 labeled 0, last 6 labeled 1.
np.random.seed(0)
X = np.random.randint(2, size=(10, 4))
y = np.repeat(np.arange(2), [4, 6])
print(X)
print(y)
In [237]:
from sklearn.naive_bayes import BernoulliNB
# Bernoulli NB: each binary feature modeled independently per class.
clf_bern = BernoulliNB().fit(X, y)
In [238]:
# Class labels seen during fit.
clf_bern.classes_
Out[238]:
In [239]:
# Samples per class (4 zeros, 6 ones).
clf_bern.class_count_
Out[239]:
In [243]:
# The prior is stored as a log; exp() recovers P(y=k).
np.exp(clf_bern.class_log_prior_)
Out[243]:
In [245]:
# Per-class counts of 1s for each feature.
fc = clf_bern.feature_count_
fc
Out[245]:
In [246]:
# Smoothed per-class feature probabilities (exp of the stored logs).
theta = np.exp(clf_bern.feature_log_prob_)
theta
Out[246]:
In [247]:
from sklearn.naive_bayes import MultinomialNB
# Same data, now treated as count features by a multinomial model.
clf_mult = MultinomialNB().fit(X,y)
In [248]:
# Class labels seen during fit.
clf_mult.classes_
Out[248]:
In [249]:
# Samples per class.
clf_mult.class_count_
Out[249]:
In [250]:
# Per-class total count of each feature.
fc = clf_mult.feature_count_
fc
Out[250]:
In [251]:
# Raw (unsmoothed) per-class feature proportions.
fc / np.repeat(fc.sum(axis=1)[:,np.newaxis], 4, axis=1)
Out[251]:
In [252]:
# Additive (Laplace) smoothing parameter.
clf_mult.alpha
Out[252]:
In [254]:
# 손으로도 구할수 있어야해요...ㅠ
# You should be able to compute this by hand too: smoothed feature
# probabilities, which should match theta from feature_log_prob_.
(fc + clf_mult.alpha) / (np.repeat(fc.sum(axis=1)[:, np.newaxis], 4, axis=1) + clf_mult.alpha * X.shape[1])
Out[254]:
In [255]:
# Smoothed per-class feature probabilities from the fitted model.
theta = np.exp(clf_mult.feature_log_prob_)
theta
Out[255]:
In [256]:
# Posterior for a new count vector.
x_new = np.array([21, 35, 29, 14])
clf_mult.predict_proba([x_new])
Out[256]:
In [257]:
# Manual multinomial posterior: product of theta^count times the class prior.
# BUG FIX: the prior must come from clf_mult, not clf_bern -- this cell is
# verifying clf_mult.predict_proba, and clf_bern is a different model.
# (They coincide here only because both were fit on the same y.)
p = (theta**x_new).prod(axis=1)*np.exp(clf_mult.class_log_prior_)
p / p.sum()
Out[257]:
In [258]:
# Posterior for a second new count vector.
x_new = np.array([18, 24, 35, 24])
clf_mult.predict_proba([x_new])
Out[258]:
In [259]:
from sklearn.datasets import fetch_20newsgroups
# sklearn.cross_validation was removed in sklearn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
# Downloads the 20-newsgroups corpus on first run.
news = fetch_20newsgroups(subset="all")
X_train, X_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.1, random_state=1)
In [262]:
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
# Six vectorizer/classifier pipelines to compare on the newsgroups data.
# Model 0: raw counts + logistic regression (baseline).
clf_0 = Pipeline([
('vect', CountVectorizer()),
('clf', LogisticRegression()),
])
# Model 1: raw counts + multinomial naive Bayes.
clf_1 = Pipeline([
('vect', CountVectorizer()),
('clf', MultinomialNB()),
])
# Model 2: TF-IDF weighting instead of raw counts.
clf_2 = Pipeline([
('vect', TfidfVectorizer()),
('clf', MultinomialNB()),
])
# Model 3: custom token pattern (keeps tokens with dots/dashes/underscores).
clf_3 = Pipeline([
('vect', TfidfVectorizer(token_pattern=r"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b")),
('clf', MultinomialNB()),
])
# Model 4: additionally drop English stop words.
clf_4 = Pipeline([
('vect', TfidfVectorizer(stop_words="english",
token_pattern=r"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b")),
('clf', MultinomialNB()),
])
# Model 5: same, with much weaker smoothing (alpha=0.01).
clf_5 = Pipeline([
('vect', TfidfVectorizer(stop_words="english",
token_pattern=r"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b")),
('clf', MultinomialNB(alpha=0.01)),
])
In [263]:
# sklearn.cross_validation was removed in sklearn 0.20; use model_selection.
from sklearn.model_selection import cross_val_score, KFold
from scipy.stats import sem
# Compare the six pipelines with 5-fold cross-validation.
# NOTE(review): CV is run on the held-out test split, presumably for speed --
# scoring on X_train would be the more conventional comparison.
# (The loop body also lost its indentation in the notebook dump; restored.)
for i, clf in enumerate([clf_0, clf_1, clf_2, clf_3, clf_4, clf_5]):
    scores = cross_val_score(clf, X_test, y_test, cv=5)
    print(("Model {0:d}: Mean score: {1: .3f} (+/-{2: .3f})").format(i, np.mean(scores), sem(scores)))
In [270]:
# Fit the baseline count + logistic-regression pipeline on the training split.
clf0_fit = clf_0.fit(X_train, y_train)
In [272]:
# Predict on the held-out test documents.
clf0_predict = clf0_fit.predict(X_test)
In [278]:
# The bare confusion_matrix(...) call on the first line of the original cell
# was computed but never shown (only a cell's last expression is displayed);
# print both the matrix and the per-class report.
print(confusion_matrix(y_test, clf0_predict))
print(classification_report(y_test, clf0_predict))
In [ ]: