In [20]:
import pandas as pd
%matplotlib inline
from sklearn import datasets
from sklearn import tree
from sklearn import metrics
import numpy as np
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
In [2]:
# Load scikit-learn's bundled iris dataset. Later cells read the feature
# matrix from `iris.data` and the class labels from `iris.target`.
iris = datasets.load_iris()
In [3]:
# Preview the first rows as a labelled table instead of echoing the whole
# Bunch, which dumps every raw array plus the full dataset description.
pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    species=iris.target_names[iris.target]
).head()
Out[3]:
In [14]:
# Keep only the feature columns from index 2 onward; in scikit-learn's iris
# ordering these are petal length and petal width.
x = iris.data[:,2:]
# Class label for every sample, aligned row-for-row with `x`.
y = iris.target
In [15]:
# Hold out part of the data for evaluation. Stratifying on `y` keeps the class
# proportions equal in both partitions; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    stratify=y,
    random_state=42,
)

# A small five-tree forest, seeded so that repeated runs grow identical trees.
forest = RandomForestClassifier(random_state=2, n_estimators=5)
forest.fit(x_train, y_train)
Out[15]:
In [16]:
# Mean accuracy of the fitted forest on the data it saw and on the held-out data.
train_accuracy = forest.score(x_train, y_train)
test_accuracy = forest.score(x_test, y_test)

print("accuracy on training set: %f" % train_accuracy)
print("accuracy on test set: %f" % test_accuracy)
In [17]:
# Seed the tree: scikit-learn permutes the candidate features at every split,
# so an unseeded tree can differ between runs when several splits score equally.
dt = tree.DecisionTreeClassifier(random_state=42)
In [21]:
# Rebuild the 75/25 split with the same stratification and seed that the
# random-forest cell uses, so both models are trained and scored on identical,
# repeatable partitions rather than on a fresh random draw each run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, train_size=0.75, stratify=y, random_state=42
)
In [22]:
# Train the decision tree on the training partition. The result of `fit` is
# assigned back to `dt`, so the cell produces no displayed output.
dt = dt.fit(x_train,y_train)
In [23]:
def measure_performance(X,y,clf, show_accuracy=True, show_classification_report=True, show_confussion_matrix=True):
    """Print evaluation metrics for a fitted classifier on one dataset.

    Parameters
    ----------
    X
        Feature rows handed to ``clf.predict``.
    y
        True labels that the predictions are compared against.
    clf
        Any object exposing ``predict(X)``.
    show_accuracy : bool
        Print the accuracy score, formatted to three decimals.
    show_classification_report : bool
        Print the report produced by ``metrics.classification_report``.
    show_confussion_matrix : bool
        Print the matrix produced by ``metrics.confusion_matrix``. The
        misspelled name is kept because it is part of the keyword interface
        that existing callers may rely on.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    # Predict once and reuse the result for every requested metric.
    y_pred = clf.predict(X)

    if show_accuracy:
        print("Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)),"\n")

    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y,y_pred),"\n")

    if show_confussion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y,y_pred),"\n")
In [26]:
measure_performance(x_train,y_train,dt)
# Evaluate the decision tree on the same data it was trained on.
# In the author's run the accuracy was 1.0, i.e. 100% of the training samples
# were classified correctly, and the confusion matrix showed no
# misclassifications.
In [27]:
measure_performance(x_test,y_test,dt)
# Evaluate the decision tree on the held-out test data.
# In the author's run the accuracy on this split was 100%.
My main takeaway is that random forests are a way of addressing the problem of overfitting. Decision trees tend to overfit the training data, and since a random forest is made up of a number of these decision trees, each of them overfits the data in a different way. Averaging the results of all of the trees in the random forest therefore gives a more accurate fit. The training-set accuracy of the random forest classifier is 98%, which (I am not sure about this) suggests that the model is not overfitting. In contrast, the training-set accuracy of the decision tree model is 100%, which probably means it is overfitting. The test accuracy of the decision tree model is better than that of the random forest classifier, which confused me a little, since I was expecting the random forest classifier to do better. Note, however, that the test set holds only about 38 flowers, so a single misclassified sample shifts the accuracy by roughly 2.6 percentage points, and a gap this small is not strong evidence either way. If a model is not overfitted, it is more likely to be accurate on new data, right?
In [ ]: