In [88]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import cross_validation  # NOTE: removed in scikit-learn >= 0.20 (kept for old environments)
from sklearn import datasets
from sklearn import metrics
from sklearn import metrics  # duplicate of the line above (kept as-is; harmless)
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split  # modern replacement for cross_validation.train_test_split
In [76]:
# Load scikit-learn's bundled Iris toy dataset (a Bunch with .data and .target).
iris = datasets.load_iris()
In [77]:
# Keep only feature columns 2 onward as predictors (presumably the petal
# measurements of the iris data — confirm against the dataset docs);
# the class labels are used unchanged.
x, y = iris.data[:, 2:], iris.target
In [96]:
# FIX: sklearn.cross_validation was removed in scikit-learn 0.20; use
# train_test_split from sklearn.model_selection (added in the import cell).
# stratify=y keeps the class proportions equal across the train/test splits,
# and random_state=42 makes the split reproducible. Same 75/25 ratio as before.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, stratify=y, random_state=42, test_size=0.25, train_size=0.75
)
In [97]:
# Train a 100-tree random forest, seeded for reproducibility.
# (.fit returns the estimator itself, so chaining yields the same object.)
forest = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)
Out[97]:
In [98]:
# Compare in-sample vs. held-out accuracy to gauge how much the forest overfits.
print("accuracy on training set: %f" % forest.score(X_train, y_train))
print("accuracy on test set: %f" % forest.score(X_test, y_test))
In [99]:
# FIX: seed the tree so tie-breaking among equally good splits is reproducible
# across runs (the original had no random_state, so repeated runs could grow
# slightly different trees and report different accuracies).
dt = tree.DecisionTreeClassifier(random_state=0)
In [100]:
# FIX: sklearn.cross_validation was removed in scikit-learn 0.20; use
# train_test_split from sklearn.model_selection (added in the import cell).
# Also stratify and seed the split to match the forest's split above, so the
# two models are compared on a like-for-like, reproducible partition
# (the original split here was unseeded and unstratified).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, stratify=y, random_state=42, test_size=0.25, train_size=0.75
)
In [101]:
# Fit the decision tree. .fit returns the estimator itself, so reassigning
# the result (as the original did) is redundant — dt is the same object.
dt.fit(x_train, y_train)
In [102]:
def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True, show_confussion_matrix=True):
    """Print selected evaluation metrics for a fitted classifier.

    FIX: the body's indentation was lost in the notebook export, making the
    function a SyntaxError as written; structure restored here.

    Parameters
    ----------
    X : array-like
        Feature matrix to predict on.
    y : array-like
        True labels corresponding to X.
    clf : fitted estimator
        Any object exposing ``predict(X)``.
    show_accuracy, show_classification_report, show_confussion_matrix : bool
        Flags selecting which reports to print. NOTE: the "confussion"
        misspelling is kept deliberately so any caller passing it as a
        keyword argument keeps working.

    Returns
    -------
    None — results are printed to stdout.
    """
    y_pred = clf.predict(X)
    if show_accuracy:
        print("Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)), "\n")
    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y, y_pred), "\n")
    if show_confussion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y, y_pred), "\n")
In [103]:
# Decision-tree performance on its own training data (in-sample).
measure_performance(x_train,y_train,dt)
In [104]:
# Decision-tree performance on the held-out test data (out-of-sample).
measure_performance(x_test,y_test,dt)
Given the accuracy percentages for both models, I would say that the plain decision tree is overfitting the training data, whereas the random forest generalizes better: it fits the training set slightly less tightly (i.e., its training accuracy is a bit lower), which is the expected trade-off for improved performance on unseen data.
In [ ]: