In [21]:
import numpy as np
import scipy as sp
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import sklearn as sk
import matplotlib as mpl
import matplotlib.pylab as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
sns.set()
sns.set_color_codes()
%matplotlib inline
%config InlineBackend.figure_format='png'
In [22]:
from __future__ import division
In [25]:
# Entropy of a fair coin: H = -sum(p * log2(p)) = 1 bit.
a = -(0.5 * np.log2(0.5) + 0.5 * np.log2(0.5))
In [26]:
a
Out[26]:
In [28]:
# Entropy of a biased (1/4, 3/4) split — about 0.811 bits.
-(0.25 * np.log2(0.25) + 0.75 * np.log2(0.75))
Out[28]:
In [54]:
import pydot
import StringIO
from IPython.core.display import Image
from sklearn.tree import export_graphviz
def draw_decision_tree(classifier):
    """Render a fitted decision tree inline as a PNG image.

    Parameters
    ----------
    classifier : fitted sklearn tree estimator
        Feature names are taken from the module-level ``iris`` dataset.

    Returns
    -------
    IPython.core.display.Image displaying the rendered tree.
    """
    # Export the tree to Graphviz "dot" text in an in-memory buffer
    # instead of writing a temporary file to disk.
    dot_buf = StringIO.StringIO()
    export_graphviz(classifier, out_file=dot_buf, feature_names=iris.feature_names)
    graph = pydot.graph_from_dot_data(dot_buf.getvalue())
    # NOTE(review): pydot >= 1.2 returns a *list* of graphs here; use
    # graph[0].create_png() on newer versions.
    # create_png() already returns the raw PNG bytes, so hand them straight
    # to Image.  The original round-trip through a second StringIO was
    # redundant, and on Python 3 writing binary PNG data into a text
    # StringIO raises TypeError.
    return Image(graph.create_png())
def plot_decision_regions(X, y, classifier, title, resolution=0.01,
                          xlabel='petal length [cm]', ylabel='petal width [cm]'):
    """Plot a classifier's decision regions over a 2-feature data set.

    Parameters
    ----------
    X : array of shape (n_samples, 2)
        Two-column feature matrix (only the first two columns are used).
    y : array of shape (n_samples,)
        Class labels.
    classifier : fitted estimator with a ``predict`` method.
    title : str
        Plot title.
    resolution : float, default 0.01
        Grid step for the background mesh (was hard-coded; now tunable).
    xlabel, ylabel : str
        Axis labels; defaults keep the original iris petal labels.
    """
    # NOTE(review): markers[1] and markers[3] are both '^' — classes 1 and 3
    # share a marker shape.  Kept as-is to preserve existing figures.
    markers = ('s', '^', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = mpl.colors.ListedColormap(colors[:len(np.unique(y))])

    # Mesh over the (padded) feature ranges, then predict every grid point
    # to paint the decision regions.
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)

    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())
    # Overlay the training samples, one scatter call per class.
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], alpha=0.8, c=cmap(idx),
                    marker=markers[idx], s=80, label=cl)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend(loc='upper left')
    plt.title(title)
    plt.show()
In [50]:
from sklearn.datasets import load_iris
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
iris = load_iris()
# Keep only petal length and petal width (columns 2 and 3) as features.
X = iris.data[:, [2, 3]]
y = iris.target
In [51]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
In [52]:
# Depth-1 stump on the iris petal features, then show its decision regions.
tree1 = DecisionTreeClassifier(criterion='entropy', max_depth=1).fit(X, y)
plot_decision_regions(X, y, tree1, "Depth 1")
In [60]:
draw_decision_tree(tree1)
Out[60]:
In [ ]:
In [63]:
# Titanic survivors data set.
# NOTE(review): dato.com is defunct — this URL is almost certainly dead now.
# Replace with a local copy (or another titanic.csv mirror) before re-running.
df = pd.read_csv('http://dato.com/files/titanic.csv', index_col = 0)
In [65]:
df.head(1)
Out[65]:
In [66]:
# Select the three predictor columns and the survival target.
feature_names = ["Pclass", "Age", "Sex"]
dfX = df.loc[:, feature_names]
dfy = df["Survived"]
dfX.tail()
Out[66]:
In [67]:
# Encode the categorical Sex column as integer codes (imputation of NaNs
# is handled in the next cell).
from sklearn.preprocessing import LabelEncoder
# .ix was deprecated and removed in pandas 1.0 — use label-based .loc.
dfX.loc[:, "Sex"] = LabelEncoder().fit_transform(dfX["Sex"])
dfX.tail()
Out[67]:
In [68]:
# Impute missing ages with the (truncated) mean age.
# Original used dfX.ix[...] (removed in pandas 1.0) and
# fillna(..., inplace=True) on a selection — chained assignment that may
# silently fail to write back.  Assign the filled column explicitly instead.
dfX["Age"] = dfX["Age"].fillna(int(dfX["Age"].mean()))
dfX.tail()
Out[68]:
In [69]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode passenger class (1/2/3) into three indicator columns.
# .as_matrix() was removed in pandas 1.0 — use .to_numpy() instead.
dfX2 = pd.DataFrame(
    OneHotEncoder().fit_transform(dfX["Pclass"].to_numpy()[:, np.newaxis]).toarray(),
    columns=['first_class', 'second_class', 'third_class'],
    index=dfX.index)
dfX = pd.concat([dfX, dfX2], axis=1)
# Drop the original ordinal column now that the dummies replace it.
del dfX["Pclass"]
dfX.tail()
Out[69]:
In [74]:
# Quick demo of OneHotEncoder on a toy column vector.
a = [[1], [2], [3]]
# BUG FIX: the original was print(...).toarray(), which under Python 3 calls
# .toarray() on print()'s return value (None) and raises AttributeError.
# The conversion belongs inside the print call.
print(OneHotEncoder().fit_transform(a).toarray())
In [75]:
# Split into train/test sets (75/25) with a fixed seed for reproducibility.
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dfX, dfy, test_size=0.25, random_state=1)
In [81]:
from sklearn.tree import DecisionTreeClassifier
# Shallow tree with a leaf-size floor to limit overfitting on the small set.
model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    min_samples_leaf=5,
).fit(X_train, y_train)
In [82]:
# Render the fitted Titanic tree as an inline PNG via Graphviz.
command_buf = StringIO.StringIO()
export_graphviz(model, out_file=command_buf,
                feature_names=['age', 'sex', '1st_class', '2nd_class', '3rd_class'])
graph = pydot.graph_from_dot_data(command_buf.getvalue())
# create_png() returns the raw PNG bytes; pass them straight to Image.
# The original extra StringIO round-trip was redundant and raises TypeError
# on Python 3 (binary data into a text buffer).
Image(graph.create_png())
Out[82]:
In [83]:
from sklearn.datasets import fetch_20newsgroups
# Restrict to three well-separated topics for a small text-classification demo.
categories = ["talk.religion.misc", "comp.graphics", "sci.space"]
news = fetch_20newsgroups(subset='train', categories=categories,
                          shuffle=True, random_state=1)
news_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=2)
In [84]:
len(news.data)
Out[84]:
In [143]:
news.target_names
Out[143]:
In [139]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# TF-IDF features feeding a shallow, regularized decision tree.
clf_news1 = Pipeline([
    ('vect', TfidfVectorizer(stop_words="english")),
    ('clf', DecisionTreeClassifier(criterion='entropy', max_depth=3,
                                   min_samples_leaf=5)),
])
clf_news1.fit(news.data, news.target);
In [140]:
# NOTE(review): this exports `model` — at this point that is still the
# *Titanic* tree from the earlier section, not the news classifier trained
# just above.  If the news tree was intended, export
# clf_news1.named_steps['clf'] instead.
command_buf = StringIO.StringIO()
export_graphviz(model, out_file=command_buf)
graph = pydot.graph_from_dot_data(command_buf.getvalue())
# create_png() returns PNG bytes directly; the original second StringIO
# round-trip was redundant and breaks on Python 3.
Image(graph.create_png())
Out[140]:
In [142]:
# Evaluate on held-out data, then compare train vs. test reports to
# gauge over/underfitting.  Predict each split once and reuse the result.
test_pred = clf_news1.predict(news_test.data)
print(confusion_matrix(news_test.target, test_pred))
print(classification_report(news.target, clf_news1.predict(news.data), digits=4))
print(classification_report(news_test.target, test_pred, digits=4))
In [ ]:
# Fit a TF-IDF vectorizer on the training corpus and transform it,
# this time outside the Pipeline so `vect` can be reused on the test set.
# NOTE(review): rebinds X and y, shadowing the iris arrays defined earlier.
vect = TfidfVectorizer(stop_words='english')
X = vect.fit_transform(news.data)
y = news.target
In [ ]:
from sklearn.tree import DecisionTreeClassifier
# Unrestricted tree on the TF-IDF features (no max_depth, so it grows until
# leaves are pure).  NOTE(review): rebinds `model`, shadowing the Titanic tree.
model = DecisionTreeClassifier(criterion='entropy').fit(X,y)
In [ ]:
# BUG FIX: the original referenced `news_data`, which is never defined —
# the held-out split is stored in `news_test`.
# NOTE(review): rebinds X_test/y_test, shadowing the Titanic split above.
X_test = vect.transform(news_test.data)
y_test = news_test.target