notebook.community

Edit and run



In [1]:

    
import pandas as pd
%matplotlib inline
from sklearn import cross_validation
from sklearn import datasets
from sklearn import tree
from sklearn import metrics
import numpy as np
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt



In [2]:

    
iris = datasets.load_iris()



In [3]:

    
x = iris.data[:,2:] 
y = iris.target



In [4]:

    
X_train, X_test, y_train, y_test = cross_validation.train_test_split(x, y, stratify=y, random_state=42, test_size=0.25,train_size=0.75)



In [5]:

    
forest = RandomForestClassifier(n_estimators=100, random_state=0)
forest.fit(X_train, y_train)









    Out[5]:





RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)



In [8]:

    
print("The Accuracy of the training set currently is: %f" % forest.score(X_train, y_train))
print("The Accuracy of the test set currently is: %f" % forest.score(X_test, y_test))









    



The Accuracy of the training set currently is: 0.990991
The Accuracy of the test set currently is: 0.948718



In [9]:

    
dt = tree.DecisionTreeClassifier()



In [10]:

    
x_train, x_test, y_train, y_test = cross_validation.train_test_split(x,y,test_size=0.25,train_size=0.75)



In [11]:

    
dt = dt.fit(x_train,y_train)



In [15]:

    
def performance_depicter(X,y,clf, show_accuracy=True, show_classification_report=True, show_confussion_matrix=True):
    y_pred=clf.predict(X)
    if show_accuracy:
        print("Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)),"\n")
    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y,y_pred),"\n")
    if show_confussion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y,y_pred),"\n")



In [16]:

    
performance_depicter(x_train,y_train,dt)









    



Accuracy:0.991 

Classification report
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        35
          1       1.00      0.98      0.99        41
          2       0.97      1.00      0.99        36

avg / total       0.99      0.99      0.99       112
 

Confusion matrix
[[35  0  0]
 [ 0 40  1]
 [ 0  0 36]]



In [17]:

    
performance_depicter(x_test,y_test,dt)









    



Accuracy:0.974 

Classification report
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        15
          1       0.90      1.00      0.95         9
          2       1.00      0.93      0.96        14

avg / total       0.98      0.97      0.97        38
 

Confusion matrix
[[15  0  0]
 [ 0  9  0]
 [ 0  1 13]]

I think the Random Forest Model appear is less accurate regarding the training data set. The original decision tree appears to be more precise overall.



In [ ]: