In [31]:
import pandas as pd
%matplotlib inline
In [32]:
from sklearn import datasets
from pandas.plotting import scatter_matrix
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn import metrics
import numpy as np
In [33]:
import matplotlib.pyplot as plt
In [34]:
iris = datasets.load_iris()
In [51]:
iris
Out[51]:
In [50]:
iris['feature_names']
Out[50]:
In [35]:
x = iris.data[:,2:] # the attributes: petal length and petal width only
y = iris.target # the target variable: the species label
In [36]:
dt = tree.DecisionTreeClassifier()
In [37]:
#dt = dt.fit(x,y) # fitting on all of the data would leave nothing held out for testing
In [38]:
#Creating a train/test split for validation
In [39]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.5,train_size=0.5)
In [40]:
dt = dt.fit(x_train,y_train)
In [41]:
def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True, show_confusion_matrix=True):
    # Print accuracy, classification report and confusion matrix for clf on (X, y).
    y_pred = clf.predict(X)
    if show_accuracy:
        print("Accuracy: {0:.3f}".format(metrics.accuracy_score(y, y_pred)), "\n")
    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y, y_pred), "\n")
    if show_confusion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y, y_pred), "\n")
In [42]:
measure_performance(x_test,y_test,dt)
In [ ]:
#I would not consider the first test results good: four wrongly predicted samples seems bad for this dataset.
#When I re-ran the split and fit two or three times the results improved, so the score clearly depends on the random split (see the sketch below).
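Rather than re-running the split by hand, cross-validation averages the score over several disjoint splits in one call. A minimal sketch, assuming the modern sklearn.model_selection module:
In [ ]:
from sklearn.model_selection import cross_val_score

#Score a fresh tree on 5 different train/test folds and report the mean accuracy.
scores = cross_val_score(tree.DecisionTreeClassifier(), x, y, cv=5)
print(scores, scores.mean())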
In [43]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.75,train_size=0.25)
In [ ]:
dt = dt.fit(x_train,y_train)
In [44]:
def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True, show_confusion_matrix=True):
    y_pred = clf.predict(X)
    if show_accuracy:
        print("Accuracy: {0:.3f}".format(metrics.accuracy_score(y, y_pred)), "\n")
    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y, y_pred), "\n")
    if show_confusion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y, y_pred), "\n")
In [45]:
measure_performance(x_test,y_test,dt)
In [46]:
#The results are better, although with train_size=0.25 the tree actually sees less training data, so the improvement is more likely the luck of this particular random split.
Load the breast cancer dataset (datasets.load_breast_cancer()) and perform basic exploratory analysis. What attributes do we have? What are we trying to predict? For context of the data, see the documentation here: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29
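A minimal exploratory sketch: wrapping the arrays in a pandas DataFrame makes the attributes and the class balance easy to inspect (bc and df are just local names chosen here):
In [ ]:
bc = datasets.load_breast_cancer()
df = pd.DataFrame(bc.data, columns=bc.feature_names)
df["target"] = bc.target # 0 = malignant, 1 = benign
print(df.shape) # 569 samples, 30 numeric attributes plus the target column
print(bc.target_names) # the two classes we are trying to predict
df.describe()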
In [ ]:
#We are predicting whether a tumor is malignant or benign, depending on the cell-nucleus measurements in the attributes.
In [48]:
breast_cancer = datasets.load_breast_cancer()
In [49]:
breast_cancer['feature_names']
Out[49]:
In [62]:
#breast_cancer.target
In [181]:
x = breast_cancer.data[:,20:] # the attributes: only the ten "worst" measurements (columns 20-29)
#With column 29 alone only 64% accuracy, with columns 27-29 already 85%, with 26-29 about 87%; with 25-29 it goes down again to 83%.
#Starting at column 20 seems to work best. But is there a way to calculate the best combination of features to take? (See the sketch after this cell.)
y = breast_cancer.target # the target variable
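To answer the question above: rather than slicing columns by hand, scikit-learn can score candidate feature counts by cross-validation. A minimal sketch, assuming the modern sklearn.model_selection and sklearn.feature_selection modules; SelectKBest with an ANOVA F-score is one reasonable selector here, not the only option:
In [ ]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV

#Pipeline: keep the k best features by ANOVA F-score, then fit the tree on them.
pipe = Pipeline([
    ("select", SelectKBest(score_func=f_classif)),
    ("tree", tree.DecisionTreeClassifier(random_state=0)),
])

#Try every possible number of features; 5-fold CV scores each choice.
grid = GridSearchCV(pipe, {"select__k": range(1, breast_cancer.data.shape[1] + 1)}, cv=5)
grid.fit(breast_cancer.data, breast_cancer.target)
print(grid.best_params_, grid.best_score_)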
In [190]:
x
Out[190]:
In [183]:
y
Out[183]:
In [184]:
dt = tree.DecisionTreeClassifier()
In [185]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.5,train_size=0.5)
In [186]:
dt = dt.fit(x_train,y_train)
In [187]:
def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True, show_confusion_matrix=True):
    y_pred = clf.predict(X)
    if show_accuracy:
        print("Accuracy: {0:.3f}".format(metrics.accuracy_score(y, y_pred)), "\n")
    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y, y_pred), "\n")
    if show_confusion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y, y_pred), "\n")
In [188]:
measure_performance(x_test,y_test,dt)
In [207]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25,train_size=0.75)
#Why do results vary? Because train_test_split shuffles the rows differently on every call; fixing random_state makes a run reproducible (see the sketch below).
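A minimal sketch of a reproducible run, using hypothetical local names (xtr, xte, ...) so the cells below are unaffected; random_state pins both the shuffle and the tree's internal tie-breaking:
In [ ]:
xtr, xte, ytr, yte = train_test_split(x, y, test_size=0.25, train_size=0.75, random_state=42)
dt_fixed = tree.DecisionTreeClassifier(random_state=0).fit(xtr, ytr)
print(metrics.accuracy_score(yte, dt_fixed.predict(xte))) # identical on every run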
In [208]:
dt = dt.fit(x_train,y_train)
In [209]:
def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True, show_confusion_matrix=True):
    y_pred = clf.predict(X)
    if show_accuracy:
        print("Accuracy: {0:.3f}".format(metrics.accuracy_score(y, y_pred)), "\n")
    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y, y_pred), "\n")
    if show_confusion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y, y_pred), "\n")
In [210]:
measure_performance(x_test,y_test,dt)