In [ ]:
# Import pandas and numpy
import pandas as pd
import numpy as np
# Import the classifiers we will be using
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Import train/test split function
from sklearn.model_selection import train_test_split
# Import cross validation scorer
from sklearn.model_selection import cross_val_score
# Import ROC AUC scoring function
from sklearn.metrics import roc_auc_score
In [ ]:
# Load the breast cancer dataset from CSV; the 'id' column becomes the
# DataFrame index rather than a regular column.
df = pd.read_csv(
    '../data/breast_cancer.csv',
    index_col='id',
)
In [ ]:
# Preview the first rows of the dataset as a quick sanity check of the load
df.head()
In [ ]:
# Inspect the dataset dimensions as a (rows, columns) tuple
df.shape
In [ ]:
# The label to predict lives in the 'diagnosis' column
target = df['diagnosis']
# Every remaining column is used as a model input
features = df.drop(columns=['diagnosis'])
In [ ]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=0,
)
In [ ]:
# Choose the model (fixed seed so the fitted tree is repeatable)
tree_model = DecisionTreeClassifier(random_state=0)
# Fit the model on the training split
tree_model.fit(X_train, y_train)
# Predict class probabilities for the held-out rows (one column per class)
y_pred = tree_model.predict_proba(X_test)
# Score the predictions using the probability in column 1
score = roc_auc_score(y_test, y_pred[:,1])
print("ROC AUC: " + str(score))
# Count the rows whose rounded probability disagrees with the true label.
# np.round is used because the np.round_ alias was deprecated in NumPy 1.25
# and removed in NumPy 2.0.
# NOTE(review): comparing rounded probabilities with y_test presumes the
# target is encoded as 0/1 -- confirm against the dataset.
n_mislabeled = (y_test != np.round(y_pred[:,1])).sum()
print("Number of mislabeled points out of a total %d points: %d" % (y_test.shape[0], n_mislabeled))
In [ ]:
# Choose the classifier to cross-validate
tree_model = DecisionTreeClassifier(random_state=0)
# Fit, predict and score in one step: cross_val_score returns one score per fold
score_tree_model = cross_val_score(
    tree_model,         # 1. Model
    features,           # 2. Features
    target,             # 3. Target
    cv=5,               # 4. Number of k-folds
    scoring='roc_auc',  # 5. Scoring function
    n_jobs=-1,          # 6. Number of CPU cores to use (-1 = all available)
)
print("ROC AUC scores: " + str(score_tree_model))
print("Average ROC AUC: " + str(score_tree_model.mean()))
In [ ]: