In [1]:
from sklearn import datasets
import pandas as pd
%matplotlib inline
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import tree
In [2]:
iris = datasets.load_iris()
In [3]:
iris
Out[3]:
In [4]:
iris.keys()
Out[4]:
In [5]:
iris['target']
Out[5]:
In [6]:
iris['target_names']
Out[6]:
In [7]:
iris['data']
Out[7]:
In [8]:
iris['feature_names']
Out[8]:
In [9]:
x = iris.data[:,2:] # the attributes: only petal length and petal width
y = iris.target # the target variable
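A quick sanity check (a sketch) confirms which columns the [:,2:] slice picks up and what shapes we are working with:
In [ ]:
# Hedged sketch: confirm the slice really selects the petal columns.
print(iris.feature_names[2:])  # expected: ['petal length (cm)', 'petal width (cm)']
print(x.shape, y.shape)        # (150, 2) attributes, (150,) targets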
In [10]:
# The goal is to create a model that predicts the value of a target variable by learning simple decision rules inferred from the data features.
dt = tree.DecisionTreeClassifier()
In [11]:
# fit the tree on the full data set
dt = dt.fit(x,y)
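To see the "simple decision rules" mentioned above, one option is to print the fitted tree; this is a sketch that assumes a scikit-learn version providing tree.export_text:
In [ ]:
# Hedged sketch: dump the learned decision rules as text (newer scikit-learn only).
print(tree.export_text(dt, feature_names=iris.feature_names[2:]))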
In [12]:
from sklearn.model_selection import train_test_split
In [13]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.50,train_size=0.50)
In [14]:
dt = dt.fit(x_train,y_train)
In [15]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
import numpy as np
In [16]:
def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True, show_confusion_matrix=True):
    y_pred = clf.predict(X)
    if show_accuracy:
        print("Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)), "\n")
    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y, y_pred), "\n")
    if show_confusion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y, y_pred), "\n")
In [17]:
measure_performance(x_test,y_test,dt) #measure on the test data (rather than train)
In [18]:
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(iris.target_names))
    plt.xticks(tick_marks, iris.target_names, rotation=45)
    plt.yticks(tick_marks, iris.target_names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
In [19]:
y_pred = dt.fit(x_train, y_train).predict(x_test) # generate predictions on the test set from the fitted model
In [20]:
cm = metrics.confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
print('Confusion matrix, without normalization')
print(cm)
plt.figure()
plot_confusion_matrix(cm)
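The print above labels the matrix as not normalized; a small sketch of the row-normalized version (rates per true class) can make class-level errors easier to read:
In [ ]:
# Hedged sketch: normalize each row so entries become per-true-class rates.
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print('Confusion matrix, normalized by true class')
print(cm_normalized)
plt.figure()
plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix')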
In [21]:
from sklearn.model_selection import train_test_split
In [22]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.75,train_size=0.25)
In [23]:
dt = dt.fit(x_train,y_train)
In [24]:
from sklearn import metrics
import numpy as np
In [25]:
def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True, show_confusion_matrix=True):
    y_pred = clf.predict(X)
    if show_accuracy:
        print("Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)), "\n")
    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y, y_pred), "\n")
    if show_confusion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y, y_pred), "\n")
In [26]:
measure_performance(x_test,y_test,dt) #measure on the test data (rather than train)
In [27]:
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(iris.target_names))
    plt.xticks(tick_marks, iris.target_names, rotation=45)
    plt.yticks(tick_marks, iris.target_names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
In [28]:
y_pred = dt.fit(x_train, y_train).predict(x_test)
In [29]:
cm = metrics.confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
print('Confusion matrix, without normalization')
print(cm)
plt.figure()
plot_confusion_matrix(cm)
In [30]:
# 75-25 seems to be better at predicting with precision
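The comparison above rests on single random splits; a sketch that averages accuracy over several random splits (using only the imports above) would make the 50/50-versus-25/75 claim firmer:
In [ ]:
# Hedged sketch: mean accuracy over repeated random splits for each training fraction.
for train_frac in (0.25, 0.50, 0.75):
    scores = []
    for seed in range(10):
        x_tr, x_te, y_tr, y_te = train_test_split(
            x, y, train_size=train_frac, test_size=1 - train_frac, random_state=seed)
        clf = tree.DecisionTreeClassifier().fit(x_tr, y_tr)
        scores.append(metrics.accuracy_score(y_te, clf.predict(x_te)))
    print("train fraction {0:.2f}: mean accuracy {1:.3f}".format(train_frac, np.mean(scores)))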
Load the breast cancer dataset (datasets.load_breast_cancer()) and perform basic exploratory analysis. What attributes do we have? What are we trying to predict? For context on the data, see the documentation here: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29
In [31]:
cancer = datasets.load_breast_cancer()
In [32]:
print(cancer)
In [33]:
cancer.keys()
Out[33]:
In [34]:
#cancer['DESCR']
In [35]:
# we are trying to predict whether a given tumor is malignant or benign
cancer['target_names']
Out[35]:
In [36]:
cancer['data']
Out[36]:
In [37]:
cancer['feature_names']
Out[37]:
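Beyond listing the attribute names, a broader exploratory view helps; this is a sketch using the pandas scatter_matrix imported at the top (the chosen columns are just examples):
In [ ]:
# Hedged sketch: put the data in a DataFrame and eyeball a few attributes at once.
cancer_df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
print(cancer_df.describe())  # ranges and scales of each attribute
cols = ['mean radius', 'mean texture', 'mean concavity', 'mean area']  # example columns
scatter_matrix(cancer_df[cols], c=cancer.target, figsize=(8, 8))
plt.show()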
In [38]:
cancer['feature_names'][11]
Out[38]:
In [39]:
cancer['target']
Out[39]:
In [40]:
x = cancer.data  # all attributes, so the scatter plots below can index individual columns
y = cancer.target
print(x)
In [87]:
plt.figure(2, figsize=(8, 6))
plt.scatter(x[:,10:11], x[:,13:14], c=y, cmap=plt.cm.CMRmap)
plt.xlabel('radius error')
plt.ylabel('area error')
plt.axhline(y=56)
plt.axvline(x=0.5)
Out[87]:
In [77]:
plt.figure(2, figsize=(8, 6))
plt.scatter(x[:,1:2], x[:,3:4], c=y, cmap=plt.cm.CMRmap)
plt.xlabel('mean texture')
plt.ylabel('mean area')
plt.axhline(y=800)
plt.axvline(x=17)
Out[77]:
In [85]:
plt.figure(2, figsize=(8, 6))
plt.scatter(x[:,5:6], x[:,6:7], c=y, cmap=plt.cm.CMRmap)
plt.xlabel('mean compactness')
plt.ylabel('mean concavity')
plt.axhline(y=0.06)
plt.axvline(x=0.25)
Out[85]:
In [46]:
x = cancer.data[:,10:11] # a single attribute: radius error (column 10)
y = cancer.target
In [47]:
dt = tree.DecisionTreeClassifier()
In [48]:
dt = dt.fit(x,y)
In [49]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.75,train_size=0.25)
In [50]:
dt = dt.fit(x_train,y_train)
In [51]:
def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True, show_confusion_matrix=True):
    y_pred = clf.predict(X)
    if show_accuracy:
        print("Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)), "\n")
    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y, y_pred), "\n")
    if show_confusion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y, y_pred), "\n")
In [52]:
measure_performance(x_test,y_test,dt) #measure on the test data (rather than train)
In [53]:
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(cancer.target_names))
    plt.xticks(tick_marks, cancer.target_names, rotation=45)
    plt.yticks(tick_marks, cancer.target_names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
In [54]:
y_pred = dt.fit(x_train, y_train).predict(x_test)
In [55]:
cm = metrics.confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
print('Confusion matrix, without normalization')
print(cm)
plt.figure()
plot_confusion_matrix(cm)
Predicted 216 as benign but only 54 are true; predicted 50 as malignant but there are 107 actual cases, so this model doesn't work.
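Before adding features, it helps to know the majority-class baseline this single-feature tree is competing against; a quick sketch on the same test split:
In [ ]:
# Hedged sketch: accuracy of always predicting the most common class in the training data.
majority_class = np.bincount(y_train).argmax()
print("Majority-class baseline accuracy: {0:.3f}".format(np.mean(y_test == majority_class)))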
In [56]:
x = cancer.data[:,:] # all attributes
y = cancer.target
In [57]:
dt = tree.DecisionTreeClassifier()
In [58]:
dt = dt.fit(x,y)
In [59]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.75,train_size=0.25)
In [60]:
dt = dt.fit(x_train,y_train)
In [61]:
def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True, show_confusion_matrix=True):
    y_pred = clf.predict(X)
    if show_accuracy:
        print("Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)), "\n")
    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y, y_pred), "\n")
    if show_confusion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y, y_pred), "\n")
In [62]:
measure_performance(x_test,y_test,dt) #measure on the test data (rather than train)
In [63]:
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(cancer.target_names))
    plt.xticks(tick_marks, cancer.target_names, rotation=45)
    plt.yticks(tick_marks, cancer.target_names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
In [64]:
y_pred = dt.fit(x_train, y_train).predict(x_test)
In [65]:
cm = metrics.confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
print('Confusion matrix, without normalization')
print(cm)
plt.figure()
plot_confusion_matrix(cm)
Predicted 250 as benign but only 19 were true; predicted 11 as malignant but there are 147 actual cases, so this model doesn't work either.
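To see which attributes the full-feature tree actually relies on, here is a sketch inspecting the fitted tree's feature_importances_ attribute:
In [ ]:
# Hedged sketch: rank attributes by the tree's impurity-based importances (top 10).
importances = dt.feature_importances_
order = np.argsort(importances)[::-1]
for i in order[:10]:
    print("{0:30s} {1:.3f}".format(cancer.feature_names[i], importances[i]))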