In [22]:
import pandas as pd
%matplotlib inline
from sklearn import datasets
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
import matplotlib.pyplot as plt
from sklearn import metrics
In [4]:
# Load the bundled iris dataset; later cells read its .data and .target attributes.
iris = datasets.load_iris()
In [5]:
# Inputs: feature columns 2 onward (presumably petal length and petal width in
# scikit-learn's iris column order -- confirm against iris.feature_names).
# Targets: the class labels.
x, y = iris.data[:, 2:], iris.target
In [6]:
# Single decision tree used as the baseline against the random forest.
# The seed (same value as the forest's) makes the fitted tree reproducible:
# scikit-learn permutes candidate features at every split, so ties between
# equally good splits can otherwise be resolved differently on each run.
dt = tree.DecisionTreeClassifier(random_state=5)
In [7]:
# Fit the tree on the full dataset; a later cell refits it on the training split only,
# so this fit is not the one that gets evaluated.
dt = dt.fit(x,y)
In [10]:
# Hold out 25% of the samples for testing. The split is seeded so the accuracy
# figures below are reproducible and do not move just because a different
# random partition was drawn on this run.
# NOTE: sklearn.cross_validation (imported at the top) was removed in
# scikit-learn 0.20; on newer versions train_test_split lives in
# sklearn.model_selection.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, train_size=0.75, random_state=5
)
In [43]:
# Ten-tree random forest with a fixed seed, trained on the training split.
forest = RandomForestClassifier(
    random_state=5,
    n_estimators=10,
)
# Left as the cell's last expression so the fitted estimator is displayed.
forest.fit(x_train, y_train)
Out[43]:
In [44]:
# Mean accuracy of the forest on the data it was trained on and on the held-out data.
train_accuracy = forest.score(x_train, y_train)
test_accuracy = forest.score(x_test, y_test)
print("accuracy on training set: %f" % train_accuracy)
print("accuracy on test set: %f" % test_accuracy)
In [45]:
#Without Random Forest
# Train the single tree on the same training split the forest was trained on.
# Drawing a fresh, unseeded split here would score the two models on different
# test sets, so any accuracy gap would reflect the partition as much as the model.
dt = dt.fit(x_train, y_train)
def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True, show_confussion_matrix=True):
    """Print evaluation metrics for a fitted classifier on the given samples.

    Parameters
    ----------
    X : samples handed to ``clf.predict``.
    y : true labels the predictions are compared against.
    clf : estimator exposing ``predict``.
    show_accuracy : when true, print the accuracy score with three decimals.
    show_classification_report : when true, print
        ``metrics.classification_report``.
    show_confussion_matrix : when true, print ``metrics.confusion_matrix``
        (the parameter keeps its original spelling so existing callers work).

    Returns
    -------
    None. Everything is written to standard output.
    """
    # Predict once and reuse the result for every metric.
    predicted = clf.predict(X)

    if show_accuracy:
        accuracy = metrics.accuracy_score(y, predicted)
        print("Accuracy:{0:.3f}".format(accuracy), "\n")

    if show_classification_report:
        print("Classification report")
        report = metrics.classification_report(y, predicted)
        print(report, "\n")

    if show_confussion_matrix:
        print("Confusion matrix")
        matrix = metrics.confusion_matrix(y, predicted)
        print(matrix, "\n")
In [46]:
# Print accuracy, classification report and confusion matrix for the decision tree.
measure_performance(x_test,y_test,dt) # evaluated on the held-out test split
In [28]:
#OK, it looked the same when I left random_state at 2, but when I raised it to 5
#it became much more accurate.
#Explanation from the notes:
#You should keep in mind that random forests, by their nature, are random, and setting different random states (or
#not setting the random_state at all) can drastically change the model that is built. The more trees there are in
#the forest, the more robust it will be against the choice of random state. If you want to have reproducible results, it
#is important to fix the random_state.
#http://stackoverflow.com/questions/28064634/random-state-pseudo-random-numberin-scikit-learn
In [ ]: