In [1]:
#First, the libraries. And, make sure matplotlib shows up in jupyter notebook! hurrah
import pandas as pd
from sklearn import datasets
# scatter_matrix is exposed from pandas.plotting in current pandas releases;
# the pandas.tools.plotting path is only tried as a fallback for legacy installs.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt
%matplotlib inline
In [8]:
from sklearn import tree

# Load the iris data; the attributes are the columns from index 2 onward.
iris = datasets.load_iris()
x = iris.data[:, 2:]  # attributes
y = iris.target       # target variable

# Build a default decision tree and fit it on the complete data set.
dt = tree.DecisionTreeClassifier().fit(x, y)
In [2]:
# train_test_split lives in sklearn.model_selection; the sklearn.cross_validation
# module was removed from scikit-learn, so it is only tried on legacy installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
In [94]:
# Hold out half of the samples for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.5,
    train_size=0.5,
    random_state=42,
)
In [95]:
# Re-fit the tree using only the training portion of the split.
dt = dt.fit(x_train, y_train)
In [3]:
from sklearn import metrics
import numpy as np
In [4]:
def measure_performance(X,y,clf, show_accuracy=True, show_classification_report=True, show_confussion_matrix=True):
    """Print evaluation metrics for a fitted classifier.

    Parameters
    ----------
    X : samples passed to ``clf.predict``.
    y : true labels the predictions are compared against.
    clf : fitted estimator exposing a ``predict`` method.
    show_accuracy : when true, print the accuracy score with three decimals.
    show_classification_report : when true, print the classification report.
    show_confussion_matrix : when true, print the confusion matrix. The
        misspelled keyword is kept so existing keyword callers keep working.

    Returns
    -------
    None. All results are written to standard output.
    """
    # Predict once and reuse the predictions for every requested metric.
    y_pred = clf.predict(X)

    if show_accuracy:
        print("Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)),"\n")

    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y,y_pred),"\n")

    if show_confussion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y,y_pred),"\n")
In [100]:
# Report accuracy, classification report and confusion matrix on the held-out samples.
measure_performance(x_test, y_test, dt)
In [99]:
# Repeat the evaluation with a 75/25 split. The tree has to be re-fitted on the
# new training portion before scoring: a model fitted on a different split may
# already have seen some of these test rows, which would inflate the metrics.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, train_size=0.75, random_state=42)
dt = dt.fit(x_train, y_train)
measure_performance(x_test, y_test, dt)
Load the breast cancer dataset (datasets.load_breast_cancer()) and perform basic exploratory analysis. What attributes do we have? What are we trying to predict? For context of the data, see the documentation here: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29
In [5]:
# Basic exploration of the breast cancer data set.
bc = datasets.load_breast_cancer()
print(bc.keys())
print(bc['target_names'])
# Only the last expression of a cell is displayed, so the description has to be
# printed explicitly or it is silently discarded.
print(bc['DESCR'])
print(bc['target'])
# Show the size of the feature matrix rather than evaluating the whole array
# mid-cell, where its value would never be shown.
print(bc['data'].shape)
bc['feature_names']
Out[5]:
In [6]:
# Tabular view of the feature matrix, one named column per feature.
# NOTE: this frame holds only the features, not the classes, so by itself it
# gives no way to see which features best predict the classes.
df = pd.DataFrame(data=bc.data, columns=bc.feature_names)
In [9]:
# Use every "worst" measurement as an attribute. The columns are selected by
# name instead of a hard-coded offset: a positional slice starting at index 21
# skips the first "worst" feature ("worst radius", which sits at index 20).
worst_columns = [i for i, name in enumerate(bc.feature_names) if name.startswith("worst")]
x = bc.data[:, worst_columns]  # the attributes
y = bc.target  # the target variable; presumably already integer-encoded - confirm against bc['target_names']
dt = tree.DecisionTreeClassifier()
dt = dt.fit(x, y)
In [10]:
# 50/50 split: half of the samples for training, half held out for testing.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.5,
    train_size=0.5,
    random_state=42,
)
In [11]:
# Re-fit the tree using only the training portion of the split.
dt = dt.fit(x_train, y_train)
In [12]:
# Score the re-fitted tree on the held-out samples.
measure_performance(x_test, y_test, dt)
In [13]:
# 75/25 split: re-partition the data, re-fit on the training portion, then score.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, train_size=0.75, random_state=42
)
dt = dt.fit(x_train, y_train)
measure_performance(x_test, y_test, dt)
In [ ]: