In [4]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import tree
In [5]:
iris = datasets.load_iris()
In [6]:
x = iris.data[:,2:] #attributes: petal length and petal width
y = iris.target #target variable
In [7]:
dt = tree.DecisionTreeClassifier()
In [8]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.5,train_size=0.5)
In [9]:
dt = dt.fit(x_train,y_train)
In [10]:
from sklearn import metrics
In [11]:
import numpy as np
In [12]:
def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True, show_confusion_matrix=True):
    y_pred = clf.predict(X)
    if show_accuracy:
        print("Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)), "\n")
    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y, y_pred), "\n")
    if show_confusion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y, y_pred), "\n")
In [13]:
measure_performance(x_test,y_test,dt)
In [14]:
#Accuracy: out of all predicted outcomes (true positives, false positives, true negatives, false negatives),
# how many were correct (true positives or true negatives)?
# A score of .973 means 97.3 percent of the cases classified were true positives or true negatives,
# while 2.7 percent were false positives (predicted true when actually false)
# or false negatives (predicted false when actually true).
#Precision: of the cases predicted to be true (true positives + false positives),
# how many of those 'true' predictions were true positives?
# A score of 1 means all of the cases predicted to be true were true positives.
# A score of .93 means 93 percent of the cases classified as class 1 were in class 1 (true positives),
# while 7 percent of those classified as class 1 were not in class 1 (false positives: we said it was true when it was false).
#Recall: of the cases where the actual condition was true, how many were predicted true (true positives)?
# A score of 1 means all the cases that were actually true were predicted to be true.
# A score of .9 means that when the actual condition was true, 90% were predicted true and 10% were predicted false
# (false negatives: we said it was false when it was true).
#Confusion matrix: rows are the actual condition (class 0, class 1, class 2);
# columns are the predicted condition (predicted class 0, predicted class 1, predicted class 2).
# So this matrix tells us that when the class was actually 2, we predicted class 1 twice.
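To make those definitions concrete, here is a small sketch (toy labels, not taken from the iris run above) that computes accuracy, precision, and recall by hand for class 1 and checks the results against sklearn.metrics:

# Toy example: 10 cases, actual labels vs. predicted labels (illustrative only).
y_true = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
y_pred = np.array([0, 0, 0, 0, 1, 1, 1, 1, 0, 1])

tp = np.sum((y_true == 1) & (y_pred == 1))  # predicted 1, actually 1
fp = np.sum((y_true == 0) & (y_pred == 1))  # predicted 1, actually 0
fn = np.sum((y_true == 1) & (y_pred == 0))  # predicted 0, actually 1
tn = np.sum((y_true == 0) & (y_pred == 0))  # predicted 0, actually 0

print("accuracy :", (tp + tn) / len(y_true))  # 8/10 = 0.8
print("precision:", tp / (tp + fp))           # 4/5 = 0.8
print("recall   :", tp / (tp + fn))           # 4/5 = 0.8
print(metrics.accuracy_score(y_true, y_pred))      # matches the hand computation
print(metrics.confusion_matrix(y_true, y_pred))    # rows = actual, columns = predicted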
In [15]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25,train_size=0.75)
In [16]:
dt = dt.fit(x_train,y_train)
In [17]:
measure_performance(x_test,y_test,dt)
In [18]:
# These scores are somewhat worse than with the 50/50 split,
# but I suspect the first model was overfitting the training data.
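One way to probe that overfitting hunch (a sketch, not part of the original notebook) is to compare accuracy on the data the tree was fit on against accuracy on the held-out data; a large gap suggests the tree has memorized the training split. The max_depth=3 value below is just an illustrative assumption.

# Compare training accuracy against test accuracy on the current split.
train_acc = metrics.accuracy_score(y_train, dt.predict(x_train))
test_acc = metrics.accuracy_score(y_test, dt.predict(x_test))
print("train accuracy:", train_acc)
print("test accuracy :", test_acc)

# Limiting tree depth is one common way to reduce overfitting in a decision tree.
dt_small = tree.DecisionTreeClassifier(max_depth=3).fit(x_train, y_train)
print("shallower tree test accuracy:", metrics.accuracy_score(y_test, dt_small.predict(x_test)))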
Load the breast cancer dataset (datasets.load_breast_cancer()) and perform basic exploratory analysis. What attributes do we have? What are we trying to predict? For context on the data, see the documentation here: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29
In [19]:
breast_cancer = datasets.load_breast_cancer()
In [20]:
breast_cancer['feature_names']
Out[20]:
In [21]:
breast_cancer['target']
Out[21]:
In [22]:
x = breast_cancer.data[:,:] #all 30 attributes
y = breast_cancer.target
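For the exploratory step asked for above, a small sketch (not part of the original run) using pandas, which was already imported at the top of the notebook, shows the attributes at a glance and what the target classes are:

# Put the data in a DataFrame to see the 30 attributes, then count the target classes.
# In this dataset class 0 is malignant and class 1 is benign.
df = pd.DataFrame(breast_cancer.data, columns=breast_cancer.feature_names)
print(df.shape)                                       # (569, 30)
print(df.describe().T.head())                         # summary stats for the first few attributes
print(breast_cancer.target_names)                     # ['malignant' 'benign']
print(pd.Series(breast_cancer.target).value_counts()) # cases per class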
In [23]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.5,train_size=0.5)
In [24]:
dt = dt.fit(x_train,y_train)
In [25]:
measure_performance(x_test,y_test,dt)
In [26]:
# In this dataset class 0 is malignant and class 1 is benign.
# When the actual case was malignant (0), 16 cases were predicted benign when they were not.
# When the actual case was benign (1), 7 cases were predicted malignant when they were not.
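As a quick check (a sketch, not in the original notebook) of which label means what before reading the confusion matrix rows: sklearn stores the mapping in target_names, so class 0 can be confirmed as malignant and class 1 as benign.

# target_names[i] is the meaning of class label i: index 0 -> 'malignant', index 1 -> 'benign'.
print(breast_cancer.target_names)
# Row i of the confusion matrix is the actual class i, column j the predicted class j,
# so the off-diagonal entry in row 0 counts malignant cases predicted as benign.
print(metrics.confusion_matrix(y_test, dt.predict(x_test)))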
In [27]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25,train_size=0.75)
In [28]:
dt = dt.fit(x_train,y_train)
In [29]:
measure_performance(x_test,y_test,dt)
In [ ]:
# The model is better.
# When the actual case was malignant (0), it was predicted to be benign 4 times;
# when the actual case was benign (1), it was predicted to be malignant once.
# Perhaps this is because the dataset is larger, and there is a relationship between the size of the
# dataset and how it should be split between training and testing.
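One way that idea could be tested (a sketch under assumed settings, not part of the original notebook) is to sweep over several test_size values and average the test accuracy across a few random splits, to see how the train/test ratio affects performance on this dataset:

# Vary the held-out fraction and average test accuracy over a few random splits.
for test_size in [0.1, 0.25, 0.5, 0.75]:
    scores = []
    for seed in range(5):
        x_tr, x_te, y_tr, y_te = train_test_split(x, y, test_size=test_size, random_state=seed)
        clf = tree.DecisionTreeClassifier(random_state=seed).fit(x_tr, y_tr)
        scores.append(metrics.accuracy_score(y_te, clf.predict(x_te)))
    print("test_size={:.2f}  mean accuracy={:.3f}".format(test_size, np.mean(scores)))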