notebook.community

Edit and run



In [21]:

    
import numpy as np
import scipy as sp
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import sklearn as sk
import matplotlib as mpl
import matplotlib.pylab as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
sns.set()
sns.set_color_codes()

%matplotlib inline
%config InlineBackend.figure_format='png'



In [22]:

    
from __future__ import division



In [25]:

    
a = -1/2*np.log2(1/2)-1/2*np.log2(1/2)



In [26]:

    
a









    Out[26]:





1.0



In [28]:

    
-1/4*np.log2(1/4)-3/4*np.log2(3/4)









    Out[28]:





0.81127812445913283



In [54]:

    
import pydot
import StringIO
from IPython.core.display import Image
from sklearn.tree import export_graphviz


def draw_decision_tree(classifier):
    # 버퍼 만들어줌.
    dot_buf = StringIO.StringIO()
    # 모델에서 dot 랭귀지로 바꿔줌
    export_graphviz(classifier, out_file = dot_buf, feature_names = iris.feature_names)
    # 스트링 버퍼에 dot랭귀지를 넣어줌 (매번 그림저장ㄴㄴ, 메모리에 ㄲ)
    graph = pydot.graph_from_dot_data(dot_buf.getvalue())
    # 이미지 png 렌더링
    image = graph.create_png()
    # 버퍼에 이미지 저장
    image_buf = StringIO.StringIO()
    image_buf.write(image)
    return Image(image_buf.getvalue())


def plot_decision_regions(X, y, classifier, title):
    resolution = 0.01
    markers = ('s','^', 'o','^','v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = mpl.colors.ListedColormap(colors[:len(np.unique(y))])
    
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution), np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap = cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())
    
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], alpha=0.8, c = cmap(idx), marker = markers[idx], s = 80, label = cl)
    
    plt.xlabel('petal length [cm]')
    plt.ylabel('petal width [cm]')
    plt.legend(loc = 'upper left')
    plt.title(title)
    plt.show()



In [50]:

    
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split

iris = load_iris()
X = iris.data[:, [2, 3]]
y = iris.target



In [51]:

    
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix



In [52]:

    
tree1 = DecisionTreeClassifier(criterion='entropy', max_depth = 1).fit(X, y)
plot_decision_regions(X, y, tree1, "Depth 1")



In [60]:

    
draw_decision_tree(tree1)









    Out[60]:



In [ ]:



In [63]:

    
# 타이타닉 생존자
df = pd.read_csv('http://dato.com/files/titanic.csv', index_col = 0)



In [65]:

    
df.head(1)









    Out[65]:






  
    
      
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
    
      PassengerId
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1
      0
      3
      Braund, Mr. Owen Harris
      male
      22.0
      1
      0
      A/5 21171
      7.25
      NaN
      S



In [66]:

    
feature_names = ["Pclass", "Age", "Sex"]
dfX = df[feature_names]
dfy = df["Survived"]
dfX.tail()









    Out[66]:






  
    
      
      Pclass
      Age
      Sex
    
    
      PassengerId
      
      
      
    
  
  
    
      887
      2
      27.0
      male
    
    
      888
      1
      19.0
      female
    
    
      889
      3
      NaN
      female
    
    
      890
      1
      26.0
      male
    
    
      891
      3
      32.0
      male



In [67]:

    
# imputation NaN을 바꿔줘야함.
from sklearn.preprocessing import LabelEncoder
dfX.ix[:,"Sex"] = LabelEncoder().fit_transform(dfX["Sex"])
dfX.tail()









    



/home/rrbb/.pyenv/versions/2.7.11/envs/python2/lib/python2.7/site-packages/pandas/core/indexing.py:465: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s






    Out[67]:






  
    
      
      Pclass
      Age
      Sex
    
    
      PassengerId
      
      
      
    
  
  
    
      887
      2
      27.0
      1
    
    
      888
      1
      19.0
      0
    
    
      889
      3
      NaN
      0
    
    
      890
      1
      26.0
      1
    
    
      891
      3
      32.0
      1



In [68]:

    
dfX.ix[:,"Age"].fillna(int(dfX["Age"].mean()), inplace=True)
dfX.tail()









    



/home/rrbb/.pyenv/versions/2.7.11/envs/python2/lib/python2.7/site-packages/pandas/core/generic.py:3191: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)






    Out[68]:






  
    
      
      Pclass
      Age
      Sex
    
    
      PassengerId
      
      
      
    
  
  
    
      887
      2
      27.0
      1
    
    
      888
      1
      19.0
      0
    
    
      889
      3
      29.0
      0
    
    
      890
      1
      26.0
      1
    
    
      891
      3
      32.0
      1



In [69]:

    
from sklearn.preprocessing import OneHotEncoder
dfX2 = pd.DataFrame(OneHotEncoder().fit_transform(dfX["Pclass"].as_matrix()[:,np.newaxis]).toarray(),
                   columns = ['first_class', 'second_class', 'third_class'], index = dfX.index)
dfX = pd.concat([dfX, dfX2], axis = 1)
del(dfX["Pclass"])
dfX.tail()









    Out[69]:






  
    
      
      Age
      Sex
      first_class
      second_class
      third_class
    
    
      PassengerId
      
      
      
      
      
    
  
  
    
      887
      27.0
      1
      0.0
      1.0
      0.0
    
    
      888
      19.0
      0
      1.0
      0.0
      0.0
    
    
      889
      29.0
      0
      0.0
      0.0
      1.0
    
    
      890
      26.0
      1
      1.0
      0.0
      0.0
    
    
      891
      32.0
      1
      0.0
      0.0
      1.0



In [74]:

    
a = [[1],[2],[3]]
print(OneHotEncoder().fit_transform(a)).toarray()









    



[[ 1.  0.  0.]
 [ 0.  1.  0.]
 [ 0.  0.  1.]]



In [75]:

    
# train, test set 나누기
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dfX, dfy, test_size = 0.25, random_state = 1)



In [81]:

    
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5).fit(X_train, y_train)



In [82]:

    
command_buf = StringIO.StringIO()
export_graphviz(model, out_file = command_buf, feature_names = ['age', 'sex', '1st_class', '2nd_class', '3rd_class'])
graph = pydot.graph_from_dot_data(command_buf.getvalue())
image = graph.create_png()
image_buf = StringIO.StringIO()
image_buf.write(image)
Image(image_buf.getvalue())









    Out[82]:



In [83]:

    
from sklearn.datasets import fetch_20newsgroups
categories = [
    "talk.religion.misc",
    "comp.graphics",
    "sci.space",
]

news = fetch_20newsgroups(subset = 'train', categories = categories, shuffle=True, random_state=1)
news_test = fetch_20newsgroups(subset='test', categories = categories, shuffle=True, random_state=2)



In [84]:

    
len(news.data)









    Out[84]:





1554



In [143]:

    
news.target_names









    Out[143]:





['comp.graphics', 'sci.space', 'talk.religion.misc']



In [139]:

    
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

clf_news1 = Pipeline([
        ('vect', TfidfVectorizer(stop_words="english")),
        ('clf', DecisionTreeClassifier(criterion='entropy', max_depth = 3, min_samples_leaf=5, )),
    ])

clf_news1.fit(news.data, news.target);



In [140]:

    
command_buf = StringIO.StringIO()
export_graphviz(model, out_file = command_buf)
graph = pydot.graph_from_dot_data(command_buf.getvalue())
image = graph.create_png()
image_buf = StringIO.StringIO()
image_buf.write(image)
Image(image_buf.getvalue())









    Out[140]:



In [142]:

    
print(confusion_matrix(news_test.target, clf_news1.predict(news_test.data)))
print(classification_report(news.target, clf_news1.predict(news.data), digits=4))
print(classification_report(news_test.target, clf_news1.predict(news_test.data), digits=4))









    



[[375  14   0]
 [194 200   0]
 [184   3  64]]
             precision    recall  f1-score   support

          0     0.5249    0.9914    0.6864       584
          1     0.9871    0.5177    0.6792       593
          2     0.9786    0.3634    0.5300       377

avg / total     0.8114    0.6583    0.6457      1554

             precision    recall  f1-score   support

          0     0.4980    0.9640    0.6567       389
          1     0.9217    0.5076    0.6547       394
          2     1.0000    0.2550    0.4063       251

avg / total     0.7813    0.6180    0.5952      1034



In [ ]:

    
vect = TfidfVectorizer(stop_words='english')
X = vect.fit_transform(news.data)
y = news.target



In [ ]:

    
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(criterion='entropy').fit(X,y)



In [ ]:

    
X_test = vect.transform(news_data.data)
y_test = news_test.target

	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
PassengerId
1	0	3	Braund, Mr. Owen Harris	male	22.0	1	0	A/5 21171	7.25	NaN	S

	Pclass	Age	Sex
PassengerId
887	2	27.0	male
888	1	19.0	female
889	3	NaN	female
890	1	26.0	male
891	3	32.0	male