In [26]:
import pandas as pd
import pydotplus
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import datasets, tree, metrics
from sklearn.cross_validation import train_test_split
from pandas.tools.plotting import scatter_matrix
In [27]:
# Load the iris dataset bundled with scikit-learn.
iris = datasets.load_iris()
In [28]:
# Predictors are the feature columns from index 2 onward; labels are iris.target.
x, y = iris.data[:, 2:], iris.target
In [29]:
# Decision tree classifier created with default arguments; it is fitted in a later cell.
dt = tree.DecisionTreeClassifier()
In [30]:
# 50%-50% split. random_state pins the shuffle so the metrics reported below
# are the same on every Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.50, train_size=0.50, random_state=42
)
dt = dt.fit(x_train, y_train)
In [31]:
def measure_performance(x,y,dt, show_accuracy=True, show_classification_report=True, show_confusion_matrix=True):
    """Print evaluation metrics for a classifier on a labelled dataset.

    Parameters
    ----------
    x : feature rows handed to ``dt.predict``.
    y : true labels the predictions are compared against.
    dt : estimator exposing ``predict`` (expected to be fitted already).
    show_accuracy : when true, print the accuracy score to three decimals.
    show_classification_report : when true, print ``metrics.classification_report``.
    show_confusion_matrix : when true, print ``metrics.confusion_matrix``.

    Returns
    -------
    None. All results are written with ``print``.
    """
    # Predict once and reuse the result for every requested metric.
    y_pred = dt.predict(x)

    if show_accuracy:
        print("Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)),"\n")

    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y,y_pred),"\n")

    if show_confusion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y,y_pred),"\n")
In [32]:
# Print accuracy, classification report and confusion matrix for the test portion.
measure_performance(x=x_test, y=y_test, dt=dt)
In [33]:
# 75%-25% split (seeded so the reported metrics are reproducible).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, train_size=0.75, random_state=42
)
# Refit on the new training portion. Without this, dt is still the model from
# the earlier fit, whose training rows can land in this x_test and inflate
# the test score.
dt = dt.fit(x_train, y_train)
In [34]:
# Print the metrics for the x_train / y_train portion of the latest split.
measure_performance(x=x_train, y=y_train, dt=dt)
In [35]:
# Print the metrics for the x_test / y_test portion of the latest split.
measure_performance(x=x_test, y=y_test, dt=dt)
Load the breast cancer dataset (`datasets.load_breast_cancer()`) and perform basic exploratory analysis. What attributes do we have? What are we trying to predict? For context on the data, see the documentation here: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29
In [36]:
# Load the breast cancer dataset bundled with scikit-learn.
bc = datasets.load_breast_cancer()
In [37]:
# Predictors are the feature columns from index 2 onward; labels are bc.target.
# NOTE(review): the [:, 2:] slice is the same one used for iris earlier in this
# notebook and drops the first two columns here too — confirm that is intended
# for this dataset.
x, y = bc.data[:, 2:], bc.target
In [38]:
# Build a fresh decision tree and fit it on the complete x, y (no hold-out here),
# so scoring this model on any subset of x, y measures training fit only.
dt = tree.DecisionTreeClassifier().fit(x, y)
In [39]:
# Bare expression: the notebook shows the estimator's repr as this cell's output.
dt
Out[39]:
In [42]:
# 50%-50% split (seeded so the reported metrics are reproducible).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.50, train_size=0.50, random_state=42
)
# Refit on the training half only. The classifier was previously fitted on all
# of x, y, so scoring it on x_test would evaluate rows it had already seen.
dt = dt.fit(x_train, y_train)
In [44]:
# Print the metrics for the x_test / y_test portion of the latest split.
measure_performance(x=x_test, y=y_test, dt=dt)
In [47]:
# 75%-25% split (seeded so the reported metrics are reproducible).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, train_size=0.75, random_state=42
)
# Refit on the new training portion. Otherwise dt is still a model from an
# earlier fit whose training rows can overlap this x_test, which makes the
# test score look better than it is.
dt = dt.fit(x_train, y_train)
In [48]:
# Print the metrics for the x_test / y_test portion of the latest split.
measure_performance(x=x_test, y=y_test, dt=dt)
In [ ]: