In [1]:
# Load libraries
import time

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

try:
    # Current location (scikit-learn >= 0.18)
    from sklearn.model_selection import train_test_split
except ImportError:
    # Legacy location, removed in scikit-learn 0.20
    from sklearn.cross_validation import train_test_split
In [2]:
# Fetch the WDBC data file from the UCI repository.
# header=None: no row of the file is used as column names, so pandas labels
# the columns with integers 0, 1, 2, ...
url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "breast-cancer-wisconsin/wdbc.data"
)
dataset = pd.read_csv(filepath_or_buffer=url, header=None)
In [3]:
# Quick sanity check: dimensions of the loaded frame as (rows, columns)
print(dataset.shape)
In [4]:
# Preview the first rows (head() defaults to 5) to eyeball the raw columns
print(dataset.head())
In [5]:
# Encode the class labels held in column 1 as integer codes.
# The fitted encoder is kept in `le` so the codes can be mapped back later.
# NOTE: this overwrites column 1 of `dataset` in place.
le = LabelEncoder()
dataset[1] = le.fit_transform(dataset[1])
# Show the first rows again to confirm column 1 now holds the encoded values
print(dataset.head(5))
In [6]:
# Separate features from the target and hold out a test set.
array = dataset.values
# Column layout: 0 = sample ID (unused), 1 = label (encoded in an earlier
# cell), 2 onwards = features.
X, y = array[:, 2:], array[:, 1]
# Fixed seed so the 80/20 train/test split is reproducible
seed = 1
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed
)
In [67]:
# Pipeline 1: standardise -> project onto 2 principal components -> logistic regression
estimators = [
    ('scaling', StandardScaler()),
    ('reduce_dim', PCA(n_components=2)),
    ('clf', LogisticRegression(random_state=1)),
]
# Build the pipeline and fit every step on the training data only
# (fit() returns the pipeline itself, so the two steps can be chained)
pipe = Pipeline(estimators).fit(X_train, y_train)
# Mean accuracy on the held-out test set
accuracy = pipe.score(X_test, y_test)
pca_report = "Accuracy of model with PCA preprocessing measured on a test set: {}"
print(pca_report.format(accuracy))
In [68]:
# Pipeline 2: standardise -> recursive feature elimination with CV -> logistic regression
estimators2 = [
    ('scaling', StandardScaler()),
    ('feature_sel', RFECV(LogisticRegression(random_state=1), scoring='accuracy')),
    ('clf', LogisticRegression(random_state=1)),
]
# create pipeline out of steps and fit on train data
pipe2 = Pipeline(estimators2)
pipe2.fit(X_train, y_train)
# get access to the fitted feature selection step of the pipeline
rfecv = pipe2.named_steps['feature_sel']

# Mean cross-validation score for each candidate number of features.
# `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `cv_results_`; prefer the new attribute and fall back to the old
# one so the cell also runs on legacy installs.
if hasattr(rfecv, "cv_results_"):
    cv_scores = rfecv.cv_results_["mean_test_score"]
else:
    cv_scores = rfecv.grid_scores_

# Plot number of features vs. cross-validation scores.
# RFECV is used with its default step / minimum feature count here, so
# entry i of the scores corresponds to i + 1 selected features.
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation accuracy")
plt.show()

print("Optimal number of features : {}".format(rfecv.n_features_))
print()
print("Selected features:")
# support_ is a boolean mask over the columns of X; X was sliced from column 2
# of the dataset, so +2 converts a mask position back to a dataset column number.
print(["{} ".format(inx+2) for inx, selected in enumerate(rfecv.support_) if selected])
print()
# measure the accuracy on test data
accuracy2 = pipe2.score(X_test, y_test)
print("Accuracy of model with RFECV preprocessing measured on a test set: {}".format(accuracy2))
In [36]: