In [189]:
import numpy as np
import pandas as pd
import pydotplus
import matplotlib.pyplot as plt
%matplotlib inline
from pandas.tools.plotting import scatter_matrix
from sklearn import datasets
from sklearn import tree
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.externals.six import StringIO
In [190]:
# Load the iris dataset through sklearn's `datasets` module.
iris = datasets.load_iris()
# Bare expression: the notebook renders the loaded object as this cell's output.
iris
Out[190]:
In [193]:
# Predictors: every row, feature columns from index 2 onward.
x = iris.data[:, 2:]
# Labels to predict.
y = iris.target
# Seed the tree so that Restart & Run All rebuilds the same model each time
# (the estimator is otherwise free to break split ties differently per run).
dt = tree.DecisionTreeClassifier(random_state=0)
In [194]:
# Hold out half of the samples for testing. The fixed seed makes the shuffle,
# and therefore every metric reported from this split, reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.5, train_size=0.5, random_state=0
)
In [195]:
# Fit the classifier on the training split and keep the result bound to `dt`.
dt = dt.fit(x_train, y_train)
In [196]:
def measure_performance(x, y, dt, show_accuracy=True, show_classification_report=True,
                        show_confusion_matrix=True, accuracy_digits=3):
    """Print evaluation metrics for a fitted classifier on the samples ``(x, y)``.

    Parameters
    ----------
    x : samples handed to ``dt.predict``.
    y : true labels the predictions are compared against.
    dt : fitted estimator exposing a ``predict`` method.
    show_accuracy : when true, print the accuracy score.
    show_classification_report : when true, print ``metrics.classification_report``.
    show_confusion_matrix : when true, print ``metrics.confusion_matrix``.
    accuracy_digits : number of decimal places used for the accuracy score.
        Defaults to 3, the precision this function previously hard-coded, so
        callers no longer need to redefine the function to change it.

    Returns
    -------
    None. All results are written to standard output.
    """
    y_pred = dt.predict(x)

    if show_accuracy:
        # Nested format field: {1} supplies the precision for the value in {0}.
        print("Accuracy:{0:.{1}f}".format(metrics.accuracy_score(y, y_pred), accuracy_digits), "\n")

    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y, y_pred), "\n")

    if show_confusion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y, y_pred), "\n")
In [197]:
# Report accuracy, classification report and confusion matrix on the held-out split.
measure_performance(x_test, y_test, dt)
Nearly 95% accuracy, with 100% precision for the first species (0), and progressively lower precision for the latter two species (1, 2).
30 were classified as species 1, 26 as species 2, and 15 as species 3, though there were two falsely classified samples in each of species 2 and species 3.
In [198]:
# Repeat the experiment with a 75/25 train/test split. The fixed seed keeps the
# shuffle, and so the metrics printed below, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, train_size=0.75, random_state=0
)
dt = dt.fit(x_train, y_train)
# measure_performance recomputes the predictions itself; y_pred is kept as a
# notebook-level variable in case other cells read it.
y_pred = dt.predict(x_test)
measure_performance(x_test, y_test, dt)
Over 97% accuracy, with again 100% precision for the first species (0), and a similar percent accuracy for the latter two species (1, 2).
12 were classified as species 1, 14 as species 2, and 11 as species 3, and there was one falsely classified sample in species 3.
There are fewer total test samples because the test set is smaller (25% of the data instead of 50%).
Load the breast cancer dataset (`datasets.load_breast_cancer()`) and perform basic exploratory analysis. What attributes do we have? What are we trying to predict? For context on the data, see the documentation here: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29
In [199]:
# Load the breast cancer dataset through sklearn's `datasets` module.
bc = datasets.load_breast_cancer()
# Bare expression: the notebook renders the loaded object as this cell's output.
bc
Out[199]:
The attributes (x) are listed near the top of the dataset, and include radius, texture, perimeter, etc. The target (y) is what we are trying to predict. Here, that is whether a tumor is malignant or benign.
In [200]:
# Predictors: every row and every feature column of the breast cancer data.
x = bc.data[:, :]
# Labels to predict.
y = bc.target
# Hold out half of the samples for testing. The fixed seed makes the shuffle,
# and therefore every metric reported from this split, reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.5, train_size=0.5, random_state=0
)
# Refit the existing classifier object on the new training data.
dt = dt.fit(x_train, y_train)
In [201]:
def measure_performance(x, y, dt, show_accuracy=True, show_classification_report=True,
                        show_confusion_matrix=True, accuracy_digits=5):
    """Print evaluation metrics for a fitted classifier on the samples ``(x, y)``.

    This redefinition shadows the earlier ``measure_performance`` in this
    notebook. The two differed only in how many decimals the accuracy was
    printed with (5 here, 3 there), so that precision is now a parameter.

    Parameters
    ----------
    x : samples handed to ``dt.predict``.
    y : true labels the predictions are compared against.
    dt : fitted estimator exposing a ``predict`` method.
    show_accuracy : when true, print the accuracy score.
    show_classification_report : when true, print ``metrics.classification_report``.
    show_confusion_matrix : when true, print ``metrics.confusion_matrix``.
    accuracy_digits : number of decimal places used for the accuracy score.
        Defaults to 5, the precision this definition previously hard-coded.

    Returns
    -------
    None. All results are written to standard output.
    """
    y_pred = dt.predict(x)

    if show_accuracy:
        # Nested format field: {1} supplies the precision for the value in {0}.
        print("Accuracy:{0:.{1}f}".format(metrics.accuracy_score(y, y_pred), accuracy_digits), "\n")

    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y, y_pred), "\n")

    if show_confusion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y, y_pred), "\n")
In [202]:
# Report accuracy, classification report and confusion matrix on the held-out split.
measure_performance(x_test, y_test, dt)
In [203]:
# Repeat the experiment with a 75/25 train/test split. The fixed seed keeps the
# shuffle, and so the metrics reported from it, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, train_size=0.75, random_state=0
)
# Refit the existing classifier object on the larger training split.
dt = dt.fit(x_train, y_train)
In [204]:
# Report accuracy, classification report and confusion matrix on the held-out split.
measure_performance(x_test, y_test, dt)
Over 95% accuracy, with roughly 95% precision. 7 out of 143 samples were misclassified.