In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split,cross_val_score, KFold, cross_val_predict
from sklearn.decomposition import PCA
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import preprocessing, decomposition
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import SGDClassifier
In [3]:
# Read in the data
data = pd.read_csv('breastcancerdata.csv')
data.head()
Out[3]:
In [4]:
#Check the columns in the raw data
data.columns
Out[4]:
In [5]:
#Check the kind of variables in the raw data
data.info()
In [6]:
#deleting the "id" column
data.drop("id",axis=1,inplace=True)
#deleting the "Unnamed: 32" column
data.drop("Unnamed: 32",axis=1,inplace=True)
In [7]:
#Check variable types after deleting the columns we are not using
data.info()
In [8]:
#counting the diagnosis variable
data.diagnosis.value_counts()
Out[8]:
In [9]:
#Encode the diagnosis as a numeric 0/1 target (M=1, B=0) in a new Diagclass column
data.loc[data['diagnosis'] == 'M', 'Diagclass'] = 1
data.loc[data['diagnosis'] == 'B', 'Diagclass'] = 0
#Check dataset
data.head()
Out[9]:
In [10]:
#Drop the original diagnosis column now that the numeric Diagclass target exists
data.drop("diagnosis",axis=1,inplace=True)
In [11]:
#Count the values of the Diagclass variable
data.Diagclass.value_counts()
Out[11]:
In [12]:
#Balance the classes by downsampling the majority class
# Separate majority and minority classes
Diagclass_majority = data[data.Diagclass==0]
Diagclass_minority = data[data.Diagclass==1]
# Downsample the Diagclass majority class to the 212 minority samples
Diagclass_majority_downsampled = resample(Diagclass_majority, replace=False, n_samples=212, random_state=123)
# Combine the downsampled majority class with the minority class
data1 = pd.concat([Diagclass_majority_downsampled, Diagclass_minority])
# Display new class counts
data1.Diagclass.value_counts()
Out[12]:
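An alternative to discarding majority-class rows is to upsample the minority class with replacement. A minimal sketch using the same resample utility; the upsampled frame data_up is illustrative and is not used below:

# Sketch: upsample the minority class instead of downsampling the majority
Diagclass_minority_upsampled = resample(Diagclass_minority,
                                        replace=True,                        # sample with replacement
                                        n_samples=len(Diagclass_majority),   # match the majority count
                                        random_state=123)
data_up = pd.concat([Diagclass_majority, Diagclass_minority_upsampled])
data_up.Diagclass.value_counts()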
In [13]:
#Define predictors and predicted variables
X = data1.drop('Diagclass', axis = 1)
Y = data1['Diagclass']
In [14]:
#Preprocess and scale data
names = X.columns
X1 = pd.DataFrame(preprocessing.scale(X), columns = names)
X1.head(2)
Out[14]:
PCA Analysis
In [16]:
# Build up the correlation matrix
Z = X1
correlation_matrix = Z.corr()
#Eigenvectors & eigenvalues of the correlation matrix (eigh suits symmetric matrices)
eig_vals, eig_vecs = np.linalg.eigh(correlation_matrix)
eig_vals = np.sort(eig_vals)[::-1]
#Fit PCA on the standardized features rather than on the correlation matrix
sklearn_pca = PCA(n_components=len(Z.columns))
Y_sklearn = sklearn_pca.fit_transform(Z)
#Scree plot of the eigenvalues in descending order
plt.plot(eig_vals)
plt.show()
print(
'The percentage of total variance in the dataset explained by each',
'component from Sklearn PCA.\n',
sklearn_pca.explained_variance_ratio_
)
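To make the three-component choice explicit, a minimal sketch plotting the cumulative explained variance from the PCA fitted above:

#Cumulative explained variance from the fitted PCA
cumvar = np.cumsum(sklearn_pca.explained_variance_ratio_)
plt.plot(range(1, len(cumvar) + 1), cumvar, marker='o')
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.show()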
In [17]:
#PCA features
# Create a scaler object (X1 is already standardized, so this step is a no-op safeguard)
sc = StandardScaler()
# Fit the scaler to the features and transform
X_std = sc.fit_transform(X1)
# Create a PCA object; from the scree plot, 3 components are retained
pca = decomposition.PCA(n_components=3)
# Fit the PCA and transform the data
X_std_pca = pca.fit_transform(X_std)
# View the new feature data's shape
X_std_pca.shape
# Create a new dataframe with the new features
XPCA = pd.DataFrame(X_std_pca)
XPCA.head()
Out[17]:
In [18]:
#Calculate Feature Importance using Random Forest
rf = RandomForestClassifier()
rf.fit(X1, Y)
#Define feature importance
feature_importance = rf.feature_importances_
# Make importances relative to max importance.
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.figure(figsize=(7, 30))
plt.subplot(1, 1, 1)
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, X1.columns[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Diagclass')
plt.show()
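Impurity-based importances can favor high-cardinality features; permutation importance is a useful sanity check. A minimal sketch, assuming scikit-learn >= 0.22 (sklearn.inspection is not used in the original notebook):

# Sketch: permutation importance as a cross-check on the impurity-based ranking
from sklearn.inspection import permutation_importance
perm = permutation_importance(rf, X1, Y, n_repeats=10, random_state=123)
perm_series = pd.Series(perm.importances_mean, index=X1.columns)
perm_series.sort_values(ascending=False).head(10)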
In [19]:
#Feature selection: univariate scores for the most relevant features
# Feature extraction with SelectKBest (default scoring: ANOVA F-test, f_classif)
test = SelectKBest()
fit = test.fit(X1, Y)
#Rank features by their univariate explanatory power
names2 = X1.columns
Bestfeatures = pd.DataFrame(fit.scores_, index = names2)
Bestfeatures.columns = ['Best Features']
Bestfeatures.sort_values(by=['Best Features'], ascending=False)
Out[19]:
In [20]:
# Create the RFE model and select features
#From the PCA analysis the number of components is 3
nfeatures = 3
lr = LogisticRegression()
rfe = RFE(lr, n_features_to_select=nfeatures)
fit = rfe.fit(X1,Y)
# Summarize the selection of the features
result_RFE = pd.DataFrame(list(zip(X1.columns, rfe.ranking_, rfe.support_)), columns=['Features','Ranking','Support'])
result_RFE.sort_values('Ranking')
Out[20]:
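Rather than fixing the number of features from the PCA result, RFECV can choose it by cross-validation. A minimal sketch with the same estimator (rfecv is illustrative and unused below):

# Sketch: let cross-validation pick the number of features instead of fixing it at 3
from sklearn.feature_selection import RFECV
rfecv = RFECV(LogisticRegression(), cv=5)
rfecv.fit(X1, Y)
print('Optimal number of features:', rfecv.n_features_)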
Feature Selection
In [21]:
#View all the predictors to make the feature selection
X1.columns
Out[21]:
In [22]:
#Feature Selection using Random Forest
X3 = X1[['perimeter_worst', 'area_worst', 'concave points_mean', 'concavity_mean','radius_worst','perimeter_mean',
'concavity_worst', 'compactness_mean','concave points_worst','compactness_worst']]
#Feature Selection using RFE & PCA
X2 = X1[['radius_worst','concave points_worst','perimeter_worst']]
In [23]:
#Split the data into training and testing datasets. Split: 70/30; train/test
X_train, X_test, y_train, y_test = train_test_split(X2,Y, test_size=0.3, random_state=123)
#Initiate the cross-validation generator, n_splits = 5
#data1 stacks the two classes back to back, so shuffle before splitting into folds
kf = KFold(n_splits=5, shuffle=True, random_state=123)
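The two classes are exactly balanced after downsampling, so a shuffled KFold behaves well here; on imbalanced data a stratified splitter keeps the class ratio constant in every fold. A minimal sketch with the same seed (skf is illustrative and unused below):

# Sketch: stratified folds preserve the class ratio in each split
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)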
Logistic Regression
In [24]:
# Initialize and fit the model.
lr = LogisticRegression(solver='liblinear')  # liblinear supports both the l1 and l2 penalties below
#Tune parameters
k1 = np.arange(20)+1
k2 = ['l1','l2']
parameters = {'C': k1,
              'penalty': k2}
#Fit parameters
lr1 = GridSearchCV(lr, param_grid=parameters, cv=kf)
#Fit the tuned classifier on the training set
lr1.fit(X_train, y_train)
#Print the best parameters
print(lr1.best_params_)
In [25]:
#Rough accuracy comparison of the three feature-selection approaches (PCA, RFE, Random Forest importance)
print((
'PCA accuracy: {}\n'
'RFE accuracy: {}\n'
'FI accuracy: {}\n'
).format(cross_val_score(lr1,XPCA,Y,cv=kf).mean(),
         cross_val_score(lr1,X2,Y,cv=kf).mean(),
         cross_val_score(lr1,X3,Y,cv=kf).mean()))
In [26]:
#Predict on the test set with the classifier tuned on the training set
predtest_y = lr1.predict(X_test)
In [27]:
#Evaluate model (test set)
target_names = ['0.0', '1.0']
print(classification_report(y_test, predtest_y, target_names=target_names))
confusion = confusion_matrix(y_test, predtest_y)
print(confusion)
# Accuracy tables.
table_test = pd.crosstab(y_test, predtest_y, margins=True)
test_tI_errors = table_test.loc[0.0,1.0] / table_test.loc['All','All']
test_tII_errors = table_test.loc[1.0,0.0] / table_test.loc['All','All']
acclr1 = cross_val_score(lr1,X_test,y_test,cv=kf).mean()
acclr1pca = cross_val_score(lr1,XPCA,Y,cv=kf).mean()
print((
'Logistic Regression accuracy: {}\n'
'Logistic Regression accuracy PCA: {}\n'
'Percent Type I errors: {}\n'
'Percent Type II errors: {}\n\n'
).format(acclr1,acclr1pca,test_tI_errors, test_tII_errors))
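The confusion matrix above comes from a single split; cross_val_predict (imported at the top but otherwise unused) gives a cross-validated version. A minimal sketch on the full balanced set:

# Sketch: cross-validated confusion matrix using cross_val_predict
cv_pred = cross_val_predict(lr1, X2, Y, cv=kf)
print(confusion_matrix(Y, cv_pred))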
Naive Bayes
In [28]:
# Initialize and fit the model.
lb = BernoulliNB()
#Tune parameters
k1 = np.arange(10)+1
parameters = {'alpha': k1}
#Fit parameters
lb1 = GridSearchCV(lb, param_grid=parameters, cv=kf)
#Fit the tuned classifier on the training set
lb1.fit(X_train, y_train)
#Print the best parameters
print(lb1.best_params_)
In [29]:
# Predict on the test set with the tuned classifier
predtestlb_y = lb1.predict(X_test)
In [30]:
#Evaluation of the model (testing)
target_names = ['0.0', '1.0']
print(classification_report(y_test, predtestlb_y, target_names=target_names))
confusion = confusion_matrix(y_test, predtestlb_y)
print(confusion)
# Accuracy tables.
table_test = pd.crosstab(y_test, predtestlb_y, margins=True)
test_tI_errors = table_test.loc[0.0,1.0] / table_test.loc['All','All']
test_tII_errors = table_test.loc[1.0,0.0] / table_test.loc['All','All']
acclb1 = cross_val_score(lb1,X_test,y_test,cv=kf).mean()
acclb1pca = cross_val_score(lb1,XPCA, Y,cv=kf).mean()
print((
'Naive Bayes accuracy: {}\n'
'Naive Bayes accuracy PCA: {}\n'
'Percent Type I errors: {}\n'
'Percent Type II errors: {}\n\n'
).format(acclb1,acclb1pca,test_tI_errors, test_tII_errors))
KNN
In [31]:
# Initialize and fit the model
KNN = KNeighborsClassifier(n_jobs=-1)
#Create range of values to fit parameters
k1 = [11,13,15,17,19,21]
k2 = [40,50,60]
k3 = ['uniform', 'distance']
k4 = ['auto', 'ball_tree','kd_tree','brute']
parameters = {'n_neighbors': k1,
'leaf_size': k2,
'weights':k3,
'algorithm':k4}
#Fit parameters
clf = GridSearchCV(KNN, param_grid=parameters, cv=kf)
#Fit the tuned model
clf.fit(X_train, y_train)
#The best hyper parameters set
print("Best Hyper Parameters:", clf.best_params_)
In [32]:
# Predict on the test dataset with the tuned classifier
predtest3_y = clf.predict(X_test)
In [33]:
#Evaluate model on the test set
target_names = ['0.0', '1.0']
print(classification_report(y_test, predtest3_y, target_names=target_names))
#Create confusion matrix
confusion = confusion_matrix(y_test, predtest3_y)
print(confusion)
# Accuracy tables.
table_test = pd.crosstab(y_test, predtest3_y, margins=True)
test_tI_errors = table_test.loc[0.0,1.0] / table_test.loc['All','All']
test_tII_errors = table_test.loc[1.0,0.0] / table_test.loc['All','All']
accclf = cross_val_score(clf,X_test,y_test,cv=kf).mean()
accclfpca = cross_val_score(clf,XPCA,Y,cv=kf).mean()
#Print Results
print((
'KNN accuracy: {}\n'
'KNN accuracy PCA: {}\n'
'Percent Type I errors: {}\n'
'Percent Type II errors: {}\n\n'
).format(accclf,accclfpca,test_tI_errors, test_tII_errors))
Random Forest
In [34]:
# Initialize the model
rf = RandomForestClassifier(n_jobs = -1)
#Create range of values to fit parameters
k1 = [20,100,300]
parameters = {'n_estimators':k1}
#Fit parameters
rf1 = GridSearchCV(rf, param_grid=parameters, cv=kf)
#Fit the tuned model
rf1.fit(X_train, y_train)
#The best hyper parameters set
print("Best Hyper Parameters:", rf1.best_params_)
In [35]:
#Predict on the test dataset with the tuned classifier
predtestrf_y = rf1.predict(X_test)
In [36]:
#Test Scores
target_names = ['0', '1']
print(classification_report(y_test, predtestrf_y, target_names=target_names))
cnf = confusion_matrix(y_test, predtestrf_y)
print(cnf)
table_test = pd.crosstab(y_test, predtestrf_y, margins=True)
test_tI_errors = table_test.loc[0.0,1.0]/table_test.loc['All','All']
test_tII_errors = table_test.loc[1.0,0.0]/table_test.loc['All','All']
accrf1 = cross_val_score(rf1,X_test,y_test,cv=kf).mean()
accrf1pca = cross_val_score(rf1,XPCA,Y,cv=kf).mean()
print((
'Random Forest accuracy: {}\n'
'Random Forest accuracy PCA: {}\n'
'Percent Type I errors: {}\n'
'Percent Type II errors: {}'
).format(accrf1,accrf1pca,test_tI_errors, test_tII_errors))
Decision Tree
In [37]:
# Train model
OTM = DecisionTreeClassifier()
#Create range of values to fit parameters
k1 = ['auto', 'sqrt', 'log2']
parameters = {'max_features': k1
}
#Fit parameters
OTM1 = GridSearchCV(OTM, param_grid=parameters, cv=kf)
#Fit the tuned model
OTM1.fit(X_train, y_train)
#The best hyper parameters set
print("Best Hyper Parameters:", OTM1.best_params_)
In [38]:
#Predict on the test dataset with the tuned classifier
predtestrf1_y = OTM1.predict(X_test)
In [39]:
#Test Scores
target_names = ['0', '1']
print(classification_report(y_test, predtestrf1_y, target_names=target_names))
cnf = confusion_matrix(y_test, predtestrf1_y)
print(cnf)
table_test = pd.crosstab(y_test, predtestrf1_y, margins=True)
test_tI_errors = table_test.loc[0.0,1.0]/table_test.loc['All','All']
test_tII_errors = table_test.loc[1.0,0.0]/table_test.loc['All','All']
OTM1acc = cross_val_score(OTM1,X_test,y_test,cv=kf).mean()
OTM1accpca = cross_val_score(OTM1,XPCA,Y,cv=kf).mean()
print((
'Decision Tree accuracy: {}\n'
'Decision Tree accuracy PCA: {}\n'
'Percent Type I errors: {}\n'
'Percent Type II errors: {}'
).format(OTM1acc,OTM1accpca, test_tI_errors, test_tII_errors))
SVC
In [40]:
# Train model
svc = SVC()
#Create range of values to fit parameters
k1 = np.arange(20)+1
k2 = ['linear','rbf']
parameters = {'C': k1,
'kernel': k2}
#Fit parameters
svc1 = GridSearchCV(svc, param_grid=parameters, cv=kf)
#Fit the tuned model
svc1.fit(X_train, y_train)
#The best hyper parameters set
print("Best Hyper Parameters:", svc1.best_params_)
In [41]:
# Predict on the test set with the tuned classifier
predtestsvc_y = svc1.predict(X_test)
In [42]:
#Test Scores
target_names = ['0.0', '1.0']
print(classification_report(y_test, predtestsvc_y, target_names=target_names))
cnf = confusion_matrix(y_test, predtestsvc_y)
print(cnf)
table_test = pd.crosstab(y_test, predtestsvc_y, margins=True)
accsvc1 = cross_val_score(svc1,X_test,y_test,cv=kf).mean()
accsvc1pca = cross_val_score(svc1,XPCA,Y,cv=kf).mean()
print((
'SVC accuracy: {}\n'
'SVC accuracy PCA: {}\n'
).format(accsvc1,accsvc1pca))
Gradient Boosting
In [43]:
# Train model
GBC = GradientBoostingClassifier()
k1 = ['deviance','exponential']
k2 = np.arange(100)+1
k5 = ['friedman_mse','mse','mae']
parameters = {'loss': k1,
'n_estimators': k2,
'criterion': k5}
#Fit parameters
GBC1 = GridSearchCV(GBC, param_grid=parameters, cv=kf)
#Fit the tuned model
GBC1.fit(X_train, y_train)
#The best hyper parameters set
print("Best Hyper Parameters:", GBC1.best_params_)
In [44]:
# Predict on the test set with the tuned classifier
predtestgb_y = GBC1.predict(X_test)
In [45]:
#Test Scores
target_names = ['0', '1']
print(classification_report(y_test, predtestgb_y, target_names=target_names))
cnf = confusion_matrix(y_test, predtestgb_y)
print(cnf)
table_test = pd.crosstab(y_test, predtestgb_y, margins=True)
test_tI_errors = table_test.loc[0.0,1.0]/table_test.loc['All','All']
test_tII_errors = table_test.loc[1.0,0.0]/table_test.loc['All','All']
accGBC1 = cross_val_score(GBC1,X_test,y_test,cv=kf).mean()
accGBC1pca = cross_val_score(GBC1,XPCA,Y,cv=kf).mean()
print((
'Gradient Boosting accuracy: {}\n'
'Gradient Boosting accuracy PCA: {}\n'
'Percent Type I errors: {}\n'
'Percent Type II errors: {}'
).format(accGBC1,accGBC1pca,test_tI_errors, test_tII_errors))
Conclusion
Based on the Breast Cancer Diagnostic dataset of 569 entries, models have been built to predict whether a tumor is benign or malignant from the most relevant features. Classifiers were tuned on the training set, accounting for 70% of the data, and evaluated on the remaining 30%. As a result, the models tested reach accuracies from 89% in the worst case up to 95% in the best case. The models used are logistic regression, KNN, SVC, Random Forest, Naive Bayes (Bernoulli), Gradient Boosting, and Decision Tree; both the hyperparameter search and the model evaluation rely on five-fold cross-validation. The first step was to create and select the features that serve as predictors and to encode the output as a [0,1] variable; for the latter, the dataset was resampled to balance the number of observations in each class. For feature selection, Random Forest feature importance, SelectKBest, and recursive feature elimination were used, and the results were compared against the number of components a PCA analysis deems meaningful. The features selected via Random Forest were narrowed down to those produced by recursive feature elimination, capping the number of features at the three suggested by the PCA analysis. The selected features are: 'radius_worst', 'concave points_worst', 'perimeter_worst'.
In [54]:
#Summary of accuracy of different models:
print(('Accuracy of each model: \n'
'Logistic Regression: {:.{prec}f} \n'
'KNN: {:.{prec}f} \n'
'SVC: {:.{prec}f} \n'
'Random Forest: {:.{prec}f} \n'
'Naive Bayes: {:.{prec}f} \n'
'Gradient Boosting: {:.{prec}f} \n'
'Decision Tree: {:.{prec}f} \n'
).format(acclr1,accclf,accsvc1,accrf1,acclb1,accGBC1,OTM1acc,prec=4))
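As a visual complement to the printed summary, a minimal sketch of a bar chart over the same accuracy variables (model_acc is illustrative):

#Sketch: bar chart of the cross-validated accuracies computed above
model_acc = pd.Series({'Logistic Regression': acclr1, 'KNN': accclf, 'SVC': accsvc1,
                       'Random Forest': accrf1, 'Naive Bayes': acclb1,
                       'Gradient Boosting': accGBC1, 'Decision Tree': OTM1acc})
model_acc.sort_values().plot(kind='barh')
plt.xlabel('Cross-validated accuracy')
plt.show()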