Exercise 1 Task 4: Preprocessing & Pipelines

The purpose of this task is to examine which preprocessing steps a dataset needs and how bundling these steps into a pipeline simplifies the overall workflow, since no step has to be executed manually on an intermediate, already-transformed dataset.


In [15]:
# a) Import packages declared in ueb01 exercise 4
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFECV

In [29]:
# a) Load the Breast Cancer Wisconsin (Diagnostic) dataset as CSV (see ueb01 exercise 01)
# NB: wdbc.data ships without a header row, so read_csv consumes the first
# record (ID 842302) as column labels; 568 of the 569 samples remain below.
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data')
# Stash the consumed record and relabel the columns with integers for positional slicing
original_column_header = list(data)
data.columns = range(data.shape[1])
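A minimal sketch of the stricter loading, not used for the runs in this notebook: passing header=None tells pandas that the file carries no header, so all 569 records are kept (the variable name data_full is only illustrative).

In [ ]:
# Hedged alternative: header=None assigns integer column labels directly,
# so no record is lost to the header.
data_full = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data',
                        header=None)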

In [30]:
# examining data
#data.tail(15)
data.head(15)


Out[30]:
0 1 2 3 4 5 6 7 8 9 ... 22 23 24 25 26 27 28 29 30 31
0 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.08690 0.07017 ... 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.18600 0.2750 0.08902
1 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.19740 0.12790 ... 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.24300 0.3613 0.08758
2 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.24140 0.10520 ... 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.25750 0.6638 0.17300
3 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.19800 0.10430 ... 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.16250 0.2364 0.07678
4 843786 M 12.45 15.70 82.57 477.1 0.12780 0.17000 0.15780 0.08089 ... 15.47 23.75 103.40 741.6 0.1791 0.5249 0.5355 0.17410 0.3985 0.12440
5 844359 M 18.25 19.98 119.60 1040.0 0.09463 0.10900 0.11270 0.07400 ... 22.88 27.66 153.20 1606.0 0.1442 0.2576 0.3784 0.19320 0.3063 0.08368
6 84458202 M 13.71 20.83 90.20 577.9 0.11890 0.16450 0.09366 0.05985 ... 17.06 28.14 110.60 897.0 0.1654 0.3682 0.2678 0.15560 0.3196 0.11510
7 844981 M 13.00 21.82 87.50 519.8 0.12730 0.19320 0.18590 0.09353 ... 15.49 30.73 106.20 739.3 0.1703 0.5401 0.5390 0.20600 0.4378 0.10720
8 84501001 M 12.46 24.04 83.97 475.9 0.11860 0.23960 0.22730 0.08543 ... 15.09 40.68 97.65 711.4 0.1853 1.0580 1.1050 0.22100 0.4366 0.20750
9 845636 M 16.02 23.24 102.70 797.8 0.08206 0.06669 0.03299 0.03323 ... 19.19 33.88 123.80 1150.0 0.1181 0.1551 0.1459 0.09975 0.2948 0.08452
10 84610002 M 15.78 17.89 103.60 781.0 0.09710 0.12920 0.09954 0.06606 ... 20.42 27.28 136.50 1299.0 0.1396 0.5609 0.3965 0.18100 0.3792 0.10480
11 846226 M 19.17 24.80 132.40 1123.0 0.09740 0.24580 0.20650 0.11180 ... 20.96 29.94 151.70 1332.0 0.1037 0.3903 0.3639 0.17670 0.3176 0.10230
12 846381 M 15.85 23.95 103.70 782.7 0.08401 0.10020 0.09938 0.05364 ... 16.84 27.66 112.00 876.5 0.1131 0.1924 0.2322 0.11190 0.2809 0.06287
13 84667401 M 13.73 22.61 93.60 578.3 0.11310 0.22930 0.21280 0.08025 ... 15.03 32.01 108.80 697.7 0.1651 0.7725 0.6943 0.22080 0.3596 0.14310
14 84799002 M 14.54 27.54 96.73 658.8 0.11390 0.15950 0.16390 0.07364 ... 17.46 37.13 124.10 943.2 0.1678 0.6577 0.7026 0.17120 0.4218 0.13410

15 rows × 32 columns


In [31]:
# Examining shape of data for slicing
data.shape


Out[31]:
(568, 32)

In [32]:
# Instantiate the LabelEncoder and slice the data into feature values and class labels
label_encoder = LabelEncoder()
X = data.loc[:, 2:].values
Y = data.loc[:, 1].values

# b) Encode malignant (M) and benign (B) as 1 and 0
Y = label_encoder.fit_transform(Y)

In [33]:
# Check correct encoding
Y


Out[33]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0])
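LabelEncoder assigns the integer codes in sorted label order, so B (benign) maps to 0 and M (malignant) to 1; the classes_ attribute makes this mapping explicit:

In [ ]:
# classes_ lists the original labels in encoding order: index 0 -> 'B', index 1 -> 'M'
label_encoder.classes_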

In [34]:
# c) Split the dataset into training data (80%) and test data (20%)
test_size = 0.20
rand_state = 1
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=rand_state)
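The two classes are somewhat imbalanced (357 benign vs. 212 malignant in the full dataset), so one could additionally pass stratify=Y to keep the class ratio identical in both splits. A hedged sketch, left commented out because the runs below use the plain split:

In [ ]:
# Stratified variant of the split above (sketch only):
# X_train, X_test, Y_train, Y_test = train_test_split(
#     X, Y, test_size=test_size, random_state=rand_state, stratify=Y)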

In [35]:
# d) + e) Build a pipeline of transformers and a final estimator to compute the accuracy
standard_scaler = ('ss', StandardScaler())
pca = ('pca', PCA(n_components=2))
logistic_regression = ('lr', LogisticRegression(random_state=rand_state))
pipeline = Pipeline([standard_scaler, pca, logistic_regression])
pipeline.fit(X_train, Y_train)
print('Accuracy: %.3f' % pipeline.score(X_test, Y_test))


Accuracy: 0.956
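Keeping the scaler and PCA inside the pipeline matters: their parameters (feature means, variances, principal components) are estimated on X_train only and merely applied to X_test, so no information leaks from the test set into the preprocessing. The whole pipeline also behaves like a single estimator, e.g. for cross-validation. A minimal sketch:

In [ ]:
from sklearn.model_selection import cross_val_score

# Every fold refits scaler, PCA and classifier on that fold's training part only
scores = cross_val_score(pipeline, X_train, Y_train, cv=10)
print('CV accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))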

In [36]:
# f) Swap PCA for RFECV to determine a valuable feature subset and estimate the
# maximum accuracy; the previously introduced LogisticRegression classifier
# serves as the estimator
lr = logistic_regression[1]
selector = RFECV(lr, step=1)
selector.fit(X_train, Y_train)


Out[36]:
RFECV(cv=None,
   estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
   n_jobs=1, scoring=None, step=1, verbose=0)

In [37]:
selector.support_


Out[37]:
array([ True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True], dtype=bool)

In [38]:
selector.ranking_


Out[38]:
array([1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1])
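In ranking_, every selected feature shares rank 1, while the eliminated features are numbered 2, 3, ... in reverse order of elimination, so here the feature at index 3 was dropped first. The size of the selected subset can be read directly off the mask:

In [ ]:
# Count of rank-1 (selected) features on the unscaled data
selector.support_.sum()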

In [43]:
# Same pipeline as before, but with an RFECV step (scored on accuracy) in place of PCA
standard_scaler = ('ss', StandardScaler())
logistic_regression = ('lr', LogisticRegression(random_state=rand_state))
rfecv = RFECV(logistic_regression[1], step=1, scoring='accuracy')
selector = ('sel', rfecv)
pipeline = Pipeline([standard_scaler, selector, logistic_regression])
pipeline.fit(X_train, Y_train)
print('Accuracy: %.3f' % pipeline.score(X_test, Y_test))


Accuracy: 0.991
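The fitted RFECV step inside the pipeline can also be retrieved via named_steps, e.g. to check how many of the standardized features it kept:

In [ ]:
# 'sel' is the step name chosen above; this is the same object as rfecv
pipeline.named_steps['sel'].n_features_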

Now the results of RFECV are analysed


In [53]:
# grid_scores_ holds one CV score per feature-subset size
# (renamed cv_results_ in newer scikit-learn versions)
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_, "ro")
plt.xlabel("number of selected features")
plt.ylabel("cross-validated accuracy")
plt.show()
print("Highest accuracy is achieved with: %s features" % rfecv.n_features_)

# NB: wdbc.data has no header row, so original_column_header actually holds the
# first data record, and the support mask indexes the 30 feature columns of X,
# which sit two positions to the right in that record (after ID and diagnosis).
# The listing below therefore only roughly identifies the selected columns.
features = [original_column_header[i] for i, v in enumerate(rfecv.support_) if v]
print("The %d selected feature columns, identified by the first record's values:\n" % len(features))
print(features)
accuracy = pipeline.score(X_test, Y_test)
print("\nWith RFECV the pipeline reaches a test accuracy of:", accuracy)


Highest accuracy is achieved with: 24 features
The 24 selected feature columns, identified by the first record's values:

['842302', 'M', '10.38', '1001', '0.1184', '0.2776', '0.1471', '0.2419', '0.07871', '1.095', '0.9053', '153.4', '0.04904', '0.05373', '0.01587', '0.03003', '0.006193', '25.38', '17.33', '184.6', '0.1622', '0.6656', '0.7119', '0.2654']

With RFECV the pipeline reaches a test accuracy of: 0.991228070175
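Since the raw file carries no feature names at all, a more faithful report maps the support mask onto the canonical feature names documented in wdbc.names. A minimal sketch, assuming the documented ordering (mean, standard error, then "worst" value of ten cell-nucleus measurements):

In [ ]:
# Canonical WDBC feature order per wdbc.names (assumed here, not read from the file)
measurements = ['radius', 'texture', 'perimeter', 'area', 'smoothness',
                'compactness', 'concavity', 'concave points', 'symmetry',
                'fractal dimension']
feature_names = (['mean ' + m for m in measurements]
                 + ['SE ' + m for m in measurements]
                 + ['worst ' + m for m in measurements])
# Mask and names are both ordered over the 30 feature columns of X
print([name for name, keep in zip(feature_names, rfecv.support_) if keep])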

In [ ]: