• Using the readings, try to create a RandomForestClassifier for the iris dataset.
  • Using a 75/25 training/test split, compare the results with the original decision tree model and describe the result as best you can in your PR

In [22]:
import pandas as pd
%matplotlib inline
from sklearn import datasets
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split  #sklearn.cross_validation was removed in newer scikit-learn
import matplotlib.pyplot as plt
from sklearn import metrics

In [4]:
iris = datasets.load_iris()

In [5]:
x = iris.data[:, 2:]  #use only the two petal features (petal length, petal width)
y = iris.target

In [6]:
dt = tree.DecisionTreeClassifier()

In [7]:
dt = dt.fit(x, y)  #initial fit on the full dataset; refit on the training split below

In [10]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, train_size=0.75)  #no random_state, so the split changes on each run

In [43]:
forest = RandomForestClassifier(n_estimators=10, random_state=5)
forest.fit(x_train, y_train)


Out[43]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=5, verbose=0, warm_start=False)

In [44]:
print("accuracy on training set: %f" % forest.score(x_train, y_train))
print("accuracy on test set: %f" % forest.score(x_test, y_test))


accuracy on training set: 0.982143
accuracy on test set: 0.973684
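
One optional way to describe the fitted forest in the PR (an addition, not part of the assignment) is its feature_importances_ attribute. Since x holds only the two petal columns, iris.feature_names[2:] lines up with the importances:

In [ ]:
#which of the two petal features does the forest lean on?
for name, importance in zip(iris.feature_names[2:], forest.feature_importances_):
    print("%s: %.3f" % (name, importance))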

In [45]:
#Without Random Forest
#Note: this draws a fresh random split, so the tree below and the forest above
#are not evaluated on identical data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, train_size=0.75)
dt = dt.fit(x_train, y_train)

def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True, show_confusion_matrix=True):
    y_pred = clf.predict(X)
    if show_accuracy:
        print("Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)), "\n")
    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y, y_pred), "\n")
    if show_confusion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y, y_pred), "\n")

In [46]:
measure_performance(x_test,y_test,dt) #measure on the test data


Accuracy:0.921 

Classification report
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        19
          1       0.75      1.00      0.86         9
          2       1.00      0.70      0.82        10

avg / total       0.94      0.92      0.92        38
 

Confusion matrix
[[19  0  0]
 [ 0  9  0]
 [ 0  3  7]] 
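
For an apples-to-apples comparison (a sketch, not in the original notebook), the forest can be refit on this same split and scored with the same helper, which removes the effect of the two cells above using different random splits:

In [ ]:
#refit the forest on the current split so tree and forest see identical data
forest = RandomForestClassifier(n_estimators=10, random_state=5)
forest.fit(x_train, y_train)
measure_performance(x_test, y_test, forest)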


In [28]:
#With random_state=2 the forest scored about the same as the decision tree,
#but raising it to 5 made the test accuracy noticeably higher.

#Explanation in the notes:
#You should keep in mind that random forests, by their nature, are random, and setting different random states (or
#not setting the random_state at all) can drastically change the model that is built. The more trees there are in
#the forest, the more robust it will be against the choice of random state. If you want to have reproducible results, it
#is important to fix the random_state.

#http://stackoverflow.com/questions/28064634/random-state-pseudo-random-numberin-scikit-learn
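
A quick way to see that note in action (an illustrative sketch, not part of the original notebook): refit forests of two sizes under several random states and compare how much the test accuracy moves. The exact numbers will vary with the split, but the 100-tree forest's scores should cluster more tightly than the 10-tree forest's.

In [ ]:
#vary random_state and watch the spread of test accuracies
for n in (10, 100):
    scores = [RandomForestClassifier(n_estimators=n, random_state=rs)
              .fit(x_train, y_train)
              .score(x_test, y_test)
              for rs in range(10)]
    print("n_estimators=%3d  min accuracy=%.3f  max accuracy=%.3f" % (n, min(scores), max(scores)))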
