notebook.community

Edit and run



In [1]:

    
import pandas as pd
%matplotlib inline



In [2]:

    
from sklearn import tree



In [3]:

    
from sklearn import datasets



In [28]:

    
from pandas.tools.plotting import scatter_matrix



In [41]:

    
import matplotlib.pyplot as plt



In [5]:

    
iris = datasets.load_iris() # load iris data set



In [6]:

    
x = iris.data[:,2:] # the attributes
y = iris.target # the target variable



In [8]:

    
dt = tree.DecisionTreeClassifier()



In [9]:

    
dt = dt.fit(x,y)

Now what?



In [16]:

    
from sklearn.externals.six import StringIO
import pydotplus #pip install pydotplus



In [17]:

    
with open("iris.dot", 'w') as f:
    f = tree.export_graphviz(dt, out_file=f)



In [18]:

    
import os
os.unlink('iris.dot')



In [61]:

    
dot_data = StringIO() 
tree.export_graphviz(dt, out_file=dot_data) #brew install graphviz
graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) 
graph.write_pdf("iris.pdf")









    Out[61]:





True



In [62]:

    
# from http://stackoverflow.com/questions/19470099/view-pdf-image-in-an-ipython-notebook
from wand.image import Image as WImage #pip install wand; brew install freetype imagemagick
img = WImage(filename='iris.pdf')
img









    Out[62]:

Let's actually do some testing of our model



In [32]:

    
from sklearn.cross_validation import train_test_split



In [49]:

    
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.33,train_size=0.66)



In [56]:

    
dt = dt.fit(x_train,y_train)



In [38]:

    
from sklearn import metrics



In [47]:

    
def measure_performance(X,y,clf, show_accuracy=True, show_classification_report=True, show_confussion_matrix=True):
    y_pred=clf.predict(X)
    if show_accuracy:
        print "Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)),"\n"
    if show_classification_report:
        print "Classification report"
        print metrics.classification_report(y,y_pred),"\n"
    if show_confussion_matrix:
        print "Confusion matrix"
        print metrics.confusion_matrix(y,y_pred),"\n"



In [51]:

    
measure_performance(x_train,y_train,dt)









    



Accuracy:0.990 

Classification report
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        35
          1       0.97      1.00      0.98        32
          2       1.00      0.97      0.98        32

avg / total       0.99      0.99      0.99        99


Confusion matrix
[[35  0  0]
 [ 0 32  0]
 [ 0  1 31]]



In [52]:

    
measure_performance(x_test,y_test,dt)









    



Accuracy:0.980 

Classification report
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        15
          1       0.94      1.00      0.97        17
          2       1.00      0.94      0.97        18

avg / total       0.98      0.98      0.98        50


Confusion matrix
[[15  0  0]
 [ 0 17  0]
 [ 0  1 17]]



In [53]:

    
from sklearn.cross_validation import cross_val_score



In [59]:

    
# http://scikit-learn.org/stable/modules/cross_validation.html#computing-cross-validated-metrics
scores = cross_val_score(dt,x_test,y_test,cv=10)



In [60]:

    
scores









    Out[60]:





array([ 1. ,  1. ,  0.8,  0.8,  1. ,  1. ,  1. ,  0.8,  1. ,  1. ])



In [ ]: