In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import datasets
from sklearn import tree
from sklearn.cross_validation import train_test_split
from pandas.tools.plotting import scatter_matrix
In [11]:
iris = datasets.load_iris()
In [28]:
iris
Out[28]:
In [12]:
print(iris.feature_names)
In [30]:
type(iris['data'])
Out[30]:
In [3]:
# Keep only the last two feature columns as predictors
# (presumably petal length and petal width -- confirm against iris.feature_names).
characteristics = iris.data[:,2:]
# Integer class labels, one per sample (three iris species).
species = iris.target
In [4]:
# Fit a decision tree on the FULL dataset (no train/test split yet);
# this fit is overwritten by the train-only fit a few cells below.
dt = tree.DecisionTreeClassifier()
dt = dt.fit(characteristics,species)
In [5]:
# 50/50 train/test split.  NOTE(review): no random_state is set, so the split
# -- and every downstream metric quoted in the comments below -- changes on
# each re-run of the notebook.
characteristics_train, characteristics_test, species_train, species_test = train_test_split(characteristics,species,test_size=0.5,train_size=0.5)
In [6]:
# Re-fit the tree on the training half only.
dt = dt.fit(characteristics_train,species_train)
In [7]:
dt
Out[7]:
In [21]:
from sklearn import metrics
In [18]:
def measure_performance(characteristics, species, clf,
                        show_accuracy=True,
                        show_classification_report=True,
                        show_confussion_matrix=True):
    """Print evaluation metrics for a fitted classifier.

    Parameters
    ----------
    characteristics : array-like of shape (n_samples, n_features)
        Feature matrix to predict on.
    species : array-like of shape (n_samples,)
        True class labels for ``characteristics``.
    clf : object
        Fitted classifier exposing a ``predict`` method.
    show_accuracy, show_classification_report, show_confussion_matrix : bool
        Toggle each section of the printed report.  The last keyword keeps
        its original misspelling ("confussion") so existing keyword callers
        are not broken.

    Prints to stdout; returns None.  Requires ``metrics`` (sklearn.metrics)
    to be imported at module level.
    """
    species_pred = clf.predict(characteristics)
    if show_accuracy:
        print("Accuracy:{0:.3f}".format(metrics.accuracy_score(species, species_pred)), "\n")
    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(species, species_pred), "\n")
    if show_confussion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(species, species_pred), "\n")
In [10]:
measure_performance(characteristics_test,species_test,dt)
In [11]:
# ACCURACY: The model predicts the species correctly for 94.7% of plant samples
# PRECISION:
# Species 1 is predicted precisely for all cases -- no false negatives/false positives
# For species 2, the model predicted 96% of cases precisely as true positives, 4% were false positives
# For species 3, the model predicted 90% of cases precisely as true positives, 10% were false positives
# CONFUSION MATRIX
# 23 plant samples were classified as Iris species 1
# 22 plant samples were classified as Iris species 2, with 3 more being falsely labelled as species 3
# 26 plant samples were classified as Iris species 3, with 1 more being falsely labelled as species 2
# Given the fact that the model doesn't fit 100% it seems at least not to be overfitting
In [19]:
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues, class_names=None):
    """Render a confusion matrix as a colour-coded image with class ticks.

    Parameters
    ----------
    cm : 2-D array
        Confusion matrix, e.g. the output of ``metrics.confusion_matrix``.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap used for the matrix cells.
    class_names : sequence of str, optional
        Axis tick labels.  Defaults to the global ``iris.target_names``
        (the original behaviour); pass explicitly to reuse this function
        for other datasets such as the breast-cancer data below.
    """
    if class_names is None:
        # Backward-compatible default: previously hard-coded to the iris labels.
        class_names = iris.target_names
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names, rotation=45)
    plt.yticks(tick_marks, class_names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
In [13]:
species_pred = dt.fit(characteristics_train, species_train).predict(characteristics_test)
In [14]:
# Confusion matrix on the held-out test half, then plotted via the helper above.
cm = metrics.confusion_matrix(species_test, species_pred)
np.set_printoptions(precision=2)
print('Confusion matrix, without normalization')
print(cm)
plt.figure()
plot_confusion_matrix(cm)
In [15]:
In [19]:
characteristics_train, characteristics_test, species_train, species_test = train_test_split(characteristics,species,test_size=0.25,train_size=0.75)
In [20]:
measure_performance(characteristics_test,species_test,dt)
In [ ]:
# ACCURACY: The model predicts the species correctly for 100% of plant samples
# CONFUSION MATRIX
# 13 plant samples were classified as Iris species 1
# 10 plant samples were classified as Iris species 2
# 15 plant samples were classified as Iris species 3
# Maybe the test dataset is too small when set to a 25% share of all data: the training data may already
# cover all eventualities, so the model looks perfect and could be overfitting;
# there is not enough variability in the test dataset to expose the model's inaccuracy
# What's a good split for training vs test data? (Maybe depends on overall size?)
Load the breast cancer dataset (datasets.load_breast_cancer()) and perform basic exploratory analysis. What attributes do we have? What are we trying to predict? For context of the data, see the documentation here: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29
In [2]:
# Only one donor? (Nick Street) --> big issue with marker sensitivity in detection!
# With a male-sounding first name (having breast cancer in 1995, so at least 30 years old) for breast cancer cells?
In [3]:
cancer = datasets.load_breast_cancer()
In [ ]:
# Reading up on scikit learn -- documentation is not that good, this one is a bit better:
# https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/datasets/base.py
In [9]:
#WE ARE TRYING TO PREDICT WHETHER A TUMOR IS MALIGNANT OR BENIGN
print(cancer.target_names)
In [10]:
#THESE ARE ALL THE ATTRIBUTES AVAILABLE
print(cancer.feature_names)
In [8]:
#BASIC DESCRIPTIVE STATISTICS BELOW IN THE TABLE
print(cancer.DESCR)
In [13]:
markers = cancer.data[:,:]
seeds = cancer.target
In [14]:
dt = tree.DecisionTreeClassifier()
dt = dt.fit(markers,seeds)
In [15]:
markers_train, markers_test, seeds_train, seeds_test = train_test_split(markers,seeds,test_size=0.5,train_size=0.5)
In [16]:
dt = dt.fit(markers_train,seeds_train)
In [33]:
def measure_performance(markers, seeds, clf,
                        show_accuracy=True,
                        show_classification_report=True,
                        show_confussion_matrix=True):
    """Print accuracy, classification report, and confusion matrix for ``clf``.

    NOTE(review): this re-defines measure_performance from an earlier cell
    with identical behaviour (only the parameter names differ); consider
    keeping a single definition and reusing it.
    """
    seeds_pred = clf.predict(markers)
    if show_accuracy:
        accuracy = metrics.accuracy_score(seeds, seeds_pred)
        print("Accuracy:{0:.3f}".format(accuracy), "\n")
    if show_classification_report:
        print("Classification report")
        report = metrics.classification_report(seeds, seeds_pred)
        print(report, "\n")
    if show_confussion_matrix:
        print("Confusion matrix")
        confusion = metrics.confusion_matrix(seeds, seeds_pred)
        print(confusion, "\n")
In [34]:
measure_performance(markers_test,seeds_test,dt)
In [ ]:
# malignant = 0
# benign = 1
# ACCURACY
# The classifier predicts 88 percent of samples correctly
# CONFUSION MATRIX:
# 47 samples are correctly predicted as malignant, whereas there are 5 that are malignant, but classified as benign
# 80 samples are correctly predicted as benign, whereas there are 11 that are benign, but classified as malignant
# PRECISION
# The matter outlined above translates to the following precision:
# For malignant samples, the model predicted 81% of cases precisely as true positives, 9% were false positives
# For benign samples, the model predicted 94% of cases precisely as true positives, 6% were false positives
In [35]:
markers_train, markers_test, seeds_train, seeds_test = train_test_split(markers,seeds,test_size=0.25,train_size=0.75)
In [36]:
dt = dt.fit(markers_train,seeds_train)
In [37]:
def measure_performance(markers, seeds, clf,
                        show_accuracy=True,
                        show_classification_report=True,
                        show_confussion_matrix=True):
    """Print selected evaluation metrics for a fitted classifier.

    NOTE(review): third identical definition of measure_performance in this
    notebook -- the function does not need to be re-defined before each call.
    """
    seeds_pred = clf.predict(markers)
    sections = []
    if show_accuracy:
        sections.append(("Accuracy:{0:.3f}".format(metrics.accuracy_score(seeds, seeds_pred)),))
    if show_classification_report:
        sections.append(("Classification report",
                         metrics.classification_report(seeds, seeds_pred)))
    if show_confussion_matrix:
        sections.append(("Confusion matrix",
                         metrics.confusion_matrix(seeds, seeds_pred)))
    for section in sections:
        for line in section[:-1]:
            print(line)
        print(section[-1], "\n")
In [38]:
measure_performance(markers_test,seeds_test,dt)
In [ ]:
# With the 75-25 split, the classifier performs better
# ACCURACY
# The classifier predicts 94 percent of samples correctly
# CONFUSION MATRIX:
# 46 samples are correctly predicted as malignant, whereas there are 5 that are malignant, but classified as benign
# 89 samples are correctly predicted as benign, whereas there are 3 that are benign, but classified as malignant
# PRECISION
# The matter outlined above translates to the following precision:
# For malignant samples, the model predicted 94% of cases precisely as true positives, 6% were false positives
# For benign samples, the model predicted 95% of cases precisely as true positives, 5% were false positives