Classify Iris flowers into their species ('setosa', 'versicolor', 'virginica') based on sepal and petal width and length.
In [10]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from plotting_utilities import plot_decision_tree, plot_feature_importances
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
iris = load_iris()
# Show the dataset description, one list entry per line
iris.DESCR.split('\n')
Out[10]:
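The Bunch object returned by load_iris can also be inspected as a table. A minimal sketch, assuming pandas is available (it is not used elsewhere in this notebook):

import pandas as pd

# Combine features and target into one DataFrame for quick inspection
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
print(df.head())
print(df['species'].value_counts())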
In [11]:
# IN: Features aka Predictors
print(iris.data.dtype)
print(iris.data.shape)
print(iris.feature_names)
iris.data[:5,:]
Out[11]:
In [12]:
# OUT: target, in this case the species
print(iris.target.dtype)
print(iris.target.shape)
print(iris.target_names)
iris.target[:5]
Out[12]:
In [13]:
X = iris.data
y = iris.target
# TODO: Try with and without max_depth (limiting the depth also helps avoid overfitting)
# clf = DecisionTreeClassifier().fit(X, y)
clf = DecisionTreeClassifier(max_depth=3).fit(X, y)
plot_decision_tree(clf, iris.feature_names, iris.target_names)
Out[13]:
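plot_decision_tree comes from the local plotting_utilities module. If that helper is not available, a rough equivalent using scikit-learn's built-in tree visualization (assuming scikit-learn >= 0.21, where plot_tree was added) might look like this; the exact layout will differ:

from sklearn.tree import plot_tree, export_text
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))
plot_tree(clf, feature_names=iris.feature_names,
          class_names=iris.target_names, filled=True)
plt.show()

# Or as plain-text rules
print(export_text(clf, feature_names=iris.feature_names))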
In [14]:
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=3)
# Train the classifier on the training data only
clf = DecisionTreeClassifier().fit(X_train, y_train)
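With only 150 samples, a single train/test split gives a fairly noisy accuracy estimate. A hedged sketch (not part of the original notebook) that averages over several splits with cross_val_score:

from sklearn.model_selection import cross_val_score

scores = cross_val_score(DecisionTreeClassifier(max_depth=3), iris.data, iris.target, cv=5)
print('5-fold CV accuracy: {:.2f} (+/- {:.2f})'.format(scores.mean(), scores.std()))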
In [15]:
# Predict on the test data and compare with the actual labels
y_pred = clf.predict(X_test)
from sklearn.metrics import confusion_matrix
# Rows: actual class, columns: predicted class
print(" ------ Predicted ------")
print(" Actual")
confusion_matrix(y_test, y_pred)
Out[15]:
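Beyond the raw matrix, scikit-learn's classification_report gives per-class precision, recall and F1 in one readable table; a short sketch using the predictions above:

from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred, target_names=iris.target_names))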
In [16]:
# Accuracy = sum of the confusion-matrix diagonal (correct predictions) / total number of test samples
print('Accuracy of Decision Tree classifier on test set, from the confusion matrix: {}'.format((15+11+11)/(15+11+11+1)))
print('Accuracy of Decision Tree classifier on test set with "score"-function: {:.2f}'
.format(clf.score(X_test, y_test)))
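The hard-coded counts above reproduce one particular confusion matrix. The same accuracy can be computed directly from the matrix, which avoids retyping numbers (a sketch; correct predictions sit on the diagonal):

cm = confusion_matrix(y_test, y_pred)
# Accuracy = trace (sum of the diagonal) / total number of test samples
print('Accuracy from confusion matrix: {:.2f}'.format(np.trace(cm) / cm.sum()))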
In [17]:
plt.figure(figsize=(10,4), dpi=80)
plot_feature_importances(clf, np.array(iris.feature_names))
plt.show()
print('Feature names : {}'.format(iris.feature_names))
print('Feature importances: {}'.format(clf.feature_importances_))
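plot_feature_importances is also from the local plotting_utilities module. If it is unavailable, a rough matplotlib-only equivalent (a sketch, reusing the fitted clf from above) could be:

importances = clf.feature_importances_
order = np.argsort(importances)
plt.figure(figsize=(10, 4), dpi=80)
plt.barh(np.array(iris.feature_names)[order], importances[order])
plt.xlabel('Feature importance')
plt.show()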