In [1]:
import pandas as pd
%matplotlib inline
from sklearn import cross_validation
from sklearn import datasets
from sklearn import tree
from sklearn import metrics
import numpy as np
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

In [2]:
iris = datasets.load_iris()

In [3]:
x = iris.data[:,2:] 
y = iris.target

In [4]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(x, y, stratify=y, random_state=42, test_size=0.25,train_size=0.75)

In [5]:
forest = RandomForestClassifier(n_estimators=100, random_state=0)
forest.fit(X_train, y_train)


Out[5]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [8]:
print("The Accuracy of the training set currently is: %f" % forest.score(X_train, y_train))
print("The Accuracy of the test set currently is: %f" % forest.score(X_test, y_test))


The Accuracy of the training set currently is: 0.990991
The Accuracy of the test set currently is: 0.948718

In [9]:
dt = tree.DecisionTreeClassifier()

In [10]:
x_train, x_test, y_train, y_test = cross_validation.train_test_split(x,y,test_size=0.25,train_size=0.75)

In [11]:
dt = dt.fit(x_train,y_train)

In [15]:
def performance_depicter(X,y,clf, show_accuracy=True, show_classification_report=True, show_confussion_matrix=True):
    y_pred=clf.predict(X)
    if show_accuracy:
        print("Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)),"\n")
    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y,y_pred),"\n")
    if show_confussion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y,y_pred),"\n")

In [16]:
performance_depicter(x_train,y_train,dt)


Accuracy:0.991 

Classification report
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        35
          1       1.00      0.98      0.99        41
          2       0.97      1.00      0.99        36

avg / total       0.99      0.99      0.99       112
 

Confusion matrix
[[35  0  0]
 [ 0 40  1]
 [ 0  0 36]] 


In [17]:
performance_depicter(x_test,y_test,dt)


Accuracy:0.974 

Classification report
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        15
          1       0.90      1.00      0.95         9
          2       1.00      0.93      0.96        14

avg / total       0.98      0.97      0.97        38
 

Confusion matrix
[[15  0  0]
 [ 0  9  0]
 [ 0  1 13]] 

I think the Random Forest Model appear is less accurate regarding the training data set. The original decision tree appears to be more precise overall.


In [ ]: