In [1]:
from sklearn import datasets, tree, metrics
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` is provided by `sklearn.model_selection`.
from sklearn.model_selection import train_test_split
import numpy as np

# One classifier instance is re-fitted by the cells below. A fixed random_state
# makes the tree's choice between equally good splits repeatable across runs.
dt = tree.DecisionTreeClassifier(random_state=42)

# Iris data: keep only columns 2 onward (petal length and petal width) as
# features; the class labels are the prediction target.
iris = datasets.load_iris()
x = iris.data[:,2:]
y = iris.target
In [2]:
# 50% - 50%
# Hold out half of the samples for testing. The fixed random_state makes the
# shuffle, and therefore the metrics printed below, reproducible on re-run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.5, train_size=0.5, random_state=42
)
dt = dt.fit(x_train, y_train)
y_pred = dt.predict(x_test)
print("50%-50%")
print("Accuracy:{0:.3f}".format(metrics.accuracy_score(y_test, y_pred)),"\nClassification report:")
print(metrics.classification_report(y_test,y_pred),"\n")
print(metrics.confusion_matrix(y_test,y_pred),"\n")
In [3]:
# 75% - 25%: train on three quarters of the samples and test on the remaining
# quarter, so the split agrees with the "75%-25%" heading used when the results
# are reported (train share first, as in the 50%-50% run).
# The fixed random_state keeps the shuffle reproducible on re-run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, train_size=0.75, random_state=42
)
dt = dt.fit(x_train, y_train)
In [4]:
# Score the most recently fitted tree on the held-out samples.
y_pred = dt.predict(x_test)
print("75%-25%")
# The "Classification report:" heading is printed before the report itself,
# matching the layout of the other report cells in this notebook.
print("Accuracy:{0:.3f}".format(metrics.accuracy_score(y_test, y_pred)),"\nClassification report:")
print(metrics.classification_report(y_test,y_pred),"\n")
print(metrics.confusion_matrix(y_test,y_pred),"\n")
Comment
Maybe the 75-25 model is overfitting.
Maybe reducing the test set increases the chances of having a high proportion of outliers in this set.
Load the breast cancer dataset (`datasets.load_breast_cancer()`) and perform basic exploratory analysis. What attributes do we have? What are we trying to predict? For context of the data, see the documentation here: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29
In [5]:
# Load scikit-learn's bundled breast cancer dataset; later cells read its
# `data`, `target` and `DESCR` entries.
cancer = datasets.load_breast_cancer()
In [6]:
# Show an excerpt of the dataset description. The character offsets are
# hard-coded, so the excerpt may shift if the DESCR text differs between
# scikit-learn versions -- TODO confirm it still covers the attribute list.
descr_excerpt = cancer['DESCR'][1200:3057]
print("Here are the attributes we have:\n", descr_excerpt)
In [7]:
# Use every feature column. `cancer.data` holds only the measured features
# (the ID and diagnosis columns of the raw UCI file are not part of it), so
# skipping leading columns as in the iris cell would discard real features.
x = cancer.data # the attributes
y = cancer.target # the target variable: 0 = malignant, 1 = benign (see cancer.target_names)
example_data = list(x[0])
# Report the attribute count from the array itself rather than hard-coding it.
print("Here's a sample of these {0} attributes (first data row):".format(x.shape[1]))
print(*example_data)
print("We're trying to predict if a subject has cancer or not. Here is a sample of the targets:", y[20:30])
In [ ]:
In [ ]:
In [8]:
# 50% - 50%: hold out half of the samples for testing. The fixed random_state
# makes the shuffle, and therefore the reported metrics, reproducible on re-run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.5, train_size=0.5, random_state=42
)
dt = dt.fit(x_train, y_train)
In [9]:
# Score the most recently fitted tree on x_test, then print accuracy, the
# per-class report and the confusion matrix under the "50%-50%" heading.
y_pred = dt.predict(x_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
report = metrics.classification_report(y_test, y_pred)
confusion = metrics.confusion_matrix(y_test, y_pred)

print("50%-50%")
print("Accuracy:{0:.3f}".format(accuracy), "\nClassification report:")
print(report, "\n")
print(confusion, "\n")
In [10]:
# 75% - 25%: train on three quarters of the samples and test on the remaining
# quarter, so the split agrees with the "75%-25%" heading used when the results
# are reported (train share first, as in the 50%-50% run).
# The fixed random_state keeps the shuffle reproducible on re-run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, train_size=0.75, random_state=42
)
dt = dt.fit(x_train, y_train)
In [11]:
# Score the most recently fitted tree on x_test, then print accuracy, the
# per-class report and the confusion matrix under the "75%-25%" heading.
y_pred = dt.predict(x_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
report = metrics.classification_report(y_test, y_pred)
confusion = metrics.confusion_matrix(y_test, y_pred)

print("75%-25%")
print("Accuracy:{0:.3f}".format(accuracy), "\nClassification report:")
print(report, "\n")
print(confusion, "\n")
In [ ]: