Imports



In [ ]:

    
# Import pandas and numpy
import pandas as pd
import numpy as np

# Import the classifiers we will be using
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Import train/test split function
from sklearn.model_selection import train_test_split

# Import cross validation scorer
from sklearn.model_selection import cross_val_score

# Import ROC AUC scoring function
from sklearn.metrics import roc_auc_score

Read the data

This is a breast cancer diagnostic dataset: the features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass.

"diagnosis" is our target: 0 for benign, 1 for malignant.



In [ ]:

    
# Load the dataset from CSV; 'index_col' makes the 'id' column the DataFrame index
df = pd.read_csv(
    '../data/breast_cancer.csv',
    index_col='id',
)



In [ ]:

    
# Preview the first few rows to sanity-check the columns that were loaded
df.head()



In [ ]:

    
# Display the dataset dimensions as (number of rows, number of columns)
df.shape



In [ ]:

    
# The label column we want to predict
target = df['diagnosis']
# Everything except the label column is used as model input
features = df.drop(columns=['diagnosis'])

Train/test split



In [ ]:

    
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=0,
)

Modelling with standard train/test split



In [ ]:

    
# Choose the model (fixed random_state so the fitted tree is reproducible)
tree_model = DecisionTreeClassifier(random_state=0)

# Fit the model on the training split only
tree_model.fit(X_train, y_train)

# Predict class probabilities for the held-out rows.
# predict_proba returns one column per class; with the 0/1 'diagnosis' target,
# column 1 holds the predicted probability of class 1 (malignant).
y_pred = tree_model.predict_proba(X_test)

# Score the predictions: ROC AUC is computed from the positive-class probabilities
score = roc_auc_score(y_test, y_pred[:, 1])

print("ROC AUC: " + str(score))

# Round the positive-class probabilities to hard 0/1 labels and count disagreements
# with the true labels. np.round replaces np.round_, which was removed in NumPy 2.0.
print("Number of mislabeled points out of a total %d points: %d" % (y_test.shape[0], (y_test != np.round(y_pred[:, 1])).sum()))

Modelling with k-fold cross validation



In [ ]:

    
# Choose the classifier
tree_model = DecisionTreeClassifier(random_state=0)

# Fit, predict and score in one step: cross_val_score trains and evaluates
# the model once per fold and returns one score for each fold.
score_tree_model = cross_val_score(
    tree_model,         # 1. Model
    features,           # 2. Features
    target,             # 3. Target
    cv=5,               # 4. Number of k-folds
    scoring='roc_auc',  # 5. Scoring function
    n_jobs=-1,          # 6. Number of CPU cores to use (-1 means all available)
)

print("ROC AUC scores: " + str(score_tree_model))
print("Average ROC AUC: " + str(score_tree_model.mean()))



In [ ]: