In [88]:

    
import pandas as pd
%matplotlib inline
from sklearn import cross_validation
from sklearn import datasets
from sklearn import tree
from sklearn import metrics
import numpy as np
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt



In [76]:

    
iris = datasets.load_iris()



In [77]:

    
x = iris.data[:,2:] 
y = iris.target



In [96]:

    
X_train, X_test, y_train, y_test = cross_validation.train_test_split(x, y, stratify=y, random_state=42, test_size=0.25,train_size=0.75)



In [97]:

    
forest = RandomForestClassifier(n_estimators=100, random_state=0)
forest.fit(X_train, y_train)









    Out[97]:





RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)



In [98]:

    
print("accuracy on training set: %f" % forest.score(X_train, y_train))
print("accuracy on test set: %f" % forest.score(X_test, y_test))









    



accuracy on training set: 0.990991
accuracy on test set: 0.948718

Original Decision Tree



In [99]:

    
dt = tree.DecisionTreeClassifier()



In [100]:

    
x_train, x_test, y_train, y_test = cross_validation.train_test_split(x,y,test_size=0.25,train_size=0.75)



In [101]:

    
dt = dt.fit(x_train,y_train)



In [102]:

    
def measure_performance(X,y,clf, show_accuracy=True, show_classification_report=True, show_confussion_matrix=True):
    y_pred=clf.predict(X)
    if show_accuracy:
        print("Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)),"\n")
    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y,y_pred),"\n")
    if show_confussion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y,y_pred),"\n")



In [103]:

    
measure_performance(x_train,y_train,dt)









    



Accuracy:0.991 

Classification report
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        41
          1       1.00      0.97      0.99        35
          2       0.97      1.00      0.99        36

avg / total       0.99      0.99      0.99       112
 

Confusion matrix
[[41  0  0]
 [ 0 34  1]
 [ 0  0 36]]



In [104]:

    
measure_performance(x_test,y_test,dt)









    



Accuracy:1.000 

Classification report
             precision    recall  f1-score   support

          0       1.00      1.00      1.00         9
          1       1.00      1.00      1.00        15
          2       1.00      1.00      1.00        14

avg / total       1.00      1.00      1.00        38
 

Confusion matrix
[[ 9  0  0]
 [ 0 15  0]
 [ 0  0 14]]

Given the % of accuracy for both models, I would say that the "original" decision tree is overfitting the training data, whereas the random forest model's tuning seems to be finer, and therefore less fit to the training dataset (i.e., its accuracy is abit lower).



In [ ]: