The purpose of this task is to examine the preprocessing steps required for this dataset and the usefulness of executing these steps within a pipeline, which simplifies the overall processing by removing the need to apply each transformation to the dataset manually.
In [15]:
# a) Import packages declared in ueb01 exercise 4
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
# NOTE: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFECV
In [29]:
# task a) Load the Breast Cancer Wisconsin (Diagnostic) dataset as CSV. See ueb01 exercise 01.
# NOTE: wdbc.data ships WITHOUT a header row, so header=None is required --
# otherwise pandas consumes the first sample as column names and one record is lost.
data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data',
    header=None)
# Human-readable column names for later reporting: id, diagnosis, then the
# 30 features (mean, standard error and worst value of 10 base measurements,
# per the UCI dataset description).
base_features = ['radius', 'texture', 'perimeter', 'area', 'smoothness',
                 'compactness', 'concavity', 'concave points', 'symmetry',
                 'fractal dimension']
original_column_header = (['id', 'diagnosis']
                          + [f + ' (mean)' for f in base_features]
                          + [f + ' (se)' for f in base_features]
                          + [f + ' (worst)' for f in base_features])
# Use numerical column labels (0..31) for positional slicing in later cells
data.columns = range(data.shape[1])
In [30]:
# Examine the first rows to get a feel for the column layout and value ranges
#data.tail(15)
data.head(15)
Out[30]:
In [31]:
# Examine the shape (rows, columns) of the data to pick the slicing bounds below
data.shape
Out[31]:
In [32]:
# Slice the frame into feature matrix X (columns 2..end) and target vector Y
# (column 1), then b) encode the diagnosis labels numerically.
# LabelEncoder assigns classes in sorted order, i.e. 'B' (benign) -> 0,
# 'M' (malignant) -> 1.
label_encoder = LabelEncoder()
X = data.iloc[:, 2:].values
Y = label_encoder.fit_transform(data.iloc[:, 1].values)
In [33]:
# Sanity check: Y should now be an array of 0/1 labels only
Y
Out[33]:
In [34]:
# c) Hold out part of the samples as a test set (80 % training / 20 % test)
test_size = 0.20
# Fixed seed so the split is reproducible; rand_state is reused by later cells.
rand_state = 1
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=rand_state)
In [35]:
# d) + e) Chain scaling, PCA and logistic regression into one Pipeline and
# report the classification accuracy on the held-out test set.
# The logistic_regression step tuple stays a named variable because a later
# cell reuses the estimator instance.
logistic_regression = ('lr', LogisticRegression(random_state=rand_state))
pipeline = Pipeline([
    ('ss', StandardScaler()),
    ('pcs', PCA(n_components=2)),
    logistic_regression,
])
pipeline.fit(X_train, Y_train)
print('Accuracy: %.3f' % pipeline.score(X_test, Y_test))
In [36]:
# f) Swap PCA for RFECV to determine a valuable feature subset and estimate
# the maximum accuracy, using the formerly introduced LogisticRegression
# classifier as the underlying estimator.
_, lr = logistic_regression
selector = RFECV(lr, step=1)
selector.fit(X_train, Y_train)
Out[36]:
In [37]:
# Boolean mask over the features: True where a feature was selected by RFECV
selector.support_
Out[37]:
In [38]:
# Elimination ranking per feature (1 = selected; higher = eliminated earlier)
selector.ranking_
Out[38]:
In [43]:
# Rebuild the pipeline, replacing the PCA step by RFECV-based feature
# selection, and report the test-set accuracy again.
logistic_regression = ('lr', LogisticRegression(random_state=rand_state))
# rfecv stays a named variable because the analysis cells below inspect it.
rfecv = RFECV(logistic_regression[1], step=1, scoring='accuracy')
pipeline = Pipeline([
    ('ss', StandardScaler()),
    ('sel', rfecv),
    logistic_regression,
])
pipeline.fit(X_train, Y_train)
print('Accuracy: %.3f' % pipeline.score(X_test, Y_test))
Next, the results of the RFECV feature selection are analysed.
In [53]:
# Plot the cross-validated accuracy as a function of the number of selected features.
# NOTE(review): grid_scores_ was removed in scikit-learn 1.2; on newer versions
# use rfecv.cv_results_["mean_test_score"] instead.
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_, "ro")
plt.xlabel("selected features")
plt.ylabel("accuracy")
plt.show()
print("Highest accuracy is achieved with: %s features" % rfecv.n_features_)
print("From the given 30 features named as in the original dataset:\n")
# X was sliced from column 2 onwards, so feature i of rfecv.support_ maps to
# original column i + 2 (columns 0 and 1 are id and diagnosis).
features = [original_column_header[i + 2]
            for i, selected in enumerate(rfecv.support_) if selected]
print(features)
accuracy = pipeline.score(X_test, Y_test)
print("\nThe pipeline reaches with RFECV a maximum accuracy of:", accuracy)
In [ ]: