Python Machine Learning - Code Examples

Chapter 7 - Combining Different Models for Ensemble Learning

Note that the optional watermark extension is a small IPython notebook plugin that I developed to make the code reproducible. You can just skip the following line(s).


In [3]:
%load_ext watermark
%watermark -a 'Sebastian Raschka' -u -d -v -p numpy,pandas,matplotlib,scipy,sklearn


---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-3-892237394af8> in <module>()
----> 1 get_ipython().magic('load_ext watermark')
      2 get_ipython().magic("watermark -a 'Sebastian Raschka' -u -d -v -p numpy,pandas,matplotlib,scipy,sklearn")

C:\Users\lishunw.AUTH\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in magic(self, arg_s)
   2156         magic_name, _, magic_arg_s = arg_s.partition(' ')
   2157         magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2158         return self.run_line_magic(magic_name, magic_arg_s)
   2159 
   2160     #-------------------------------------------------------------------------

C:\Users\lishunw.AUTH\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in run_line_magic(self, magic_name, line)
   2077                 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
   2078             with self.builtin_trap:
-> 2079                 result = fn(*args,**kwargs)
   2080             return result
   2081 

<decorator-gen-63> in load_ext(self, module_str)

C:\Users\lishunw.AUTH\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\magic.py in <lambda>(f, *a, **k)
    186     # but it's overkill for just that one bit of state.
    187     def magic_deco(arg):
--> 188         call = lambda f, *a, **k: f(*a, **k)
    189 
    190         if callable(arg):

C:\Users\lishunw.AUTH\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\magics\extension.py in load_ext(self, module_str)
     35         if not module_str:
     36             raise UsageError('Missing module name.')
---> 37         res = self.shell.extension_manager.load_extension(module_str)
     38 
     39         if res == 'already loaded':

C:\Users\lishunw.AUTH\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\extensions.py in load_extension(self, module_str)
     81             if module_str not in sys.modules:
     82                 with prepended_to_syspath(self.ipython_extension_dir):
---> 83                     __import__(module_str)
     84             mod = sys.modules[module_str]
     85             if self._call_load_ipython_extension(mod):

ModuleNotFoundError: No module named 'watermark'

The use of watermark is optional. You can install this IPython extension via "pip install watermark". For more information, please see: https://github.com/rasbt/watermark.



Overview




In [4]:
from IPython.display import Image
%matplotlib inline

In [5]:
# Added version check for recent scikit-learn 0.18 checks
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version

Learning with ensembles


In [6]:
Image(filename='./images/07_01.png', width=500)


---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-6-c437318b0358> in <module>()
----> 1 Image(filename='./images/07_01.png', width=500)

C:\Users\lishunw.AUTH\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\display.py in __init__(self, data, url, filename, format, embed, width, height, retina, unconfined, metadata)
    756         self.unconfined = unconfined
    757         self.metadata = metadata
--> 758         super(Image, self).__init__(data=data, url=url, filename=filename)
    759 
    760         if retina:

C:\Users\lishunw.AUTH\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\display.py in __init__(self, data, url, filename)
    392         self.filename = None if filename is None else unicode_type(filename)
    393 
--> 394         self.reload()
    395         self._check_data()
    396 

C:\Users\lishunw.AUTH\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\display.py in reload(self)
    778         """Reload the raw data from file or URL."""
    779         if self.embed:
--> 780             super(Image,self).reload()
    781             if self.retina:
    782                 self._retina_shape()

C:\Users\lishunw.AUTH\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\display.py in reload(self)
    410         """Reload the raw data from file or URL."""
    411         if self.filename is not None:
--> 412             with open(self.filename, self._read_flags) as f:
    413                 self.data = f.read()
    414         elif self.url is not None:

FileNotFoundError: [Errno 2] No such file or directory: './images/07_01.png'

In [7]:
Image(filename='./images/07_02.png', width=500)


---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-7-15e287fa4e4f> in <module>()
----> 1 Image(filename='./images/07_02.png', width=500)

C:\Users\lishunw.AUTH\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\display.py in __init__(self, data, url, filename, format, embed, width, height, retina, unconfined, metadata)
    756         self.unconfined = unconfined
    757         self.metadata = metadata
--> 758         super(Image, self).__init__(data=data, url=url, filename=filename)
    759 
    760         if retina:

C:\Users\lishunw.AUTH\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\display.py in __init__(self, data, url, filename)
    392         self.filename = None if filename is None else unicode_type(filename)
    393 
--> 394         self.reload()
    395         self._check_data()
    396 

C:\Users\lishunw.AUTH\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\display.py in reload(self)
    778         """Reload the raw data from file or URL."""
    779         if self.embed:
--> 780             super(Image,self).reload()
    781             if self.retina:
    782                 self._retina_shape()

C:\Users\lishunw.AUTH\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\display.py in reload(self)
    410         """Reload the raw data from file or URL."""
    411         if self.filename is not None:
--> 412             with open(self.filename, self._read_flags) as f:
    413                 self.data = f.read()
    414         elif self.url is not None:

FileNotFoundError: [Errno 2] No such file or directory: './images/07_02.png'

In [8]:
from scipy.misc import comb
import math

def ensemble_error(n_classifier, error):
    k_start = math.ceil(n_classifier / 2.0)
    probs = [comb(n_classifier, k) * error**k * (1-error)**(n_classifier - k)
             for k in range(k_start, n_classifier + 1)]
    return sum(probs)

Note

For historical reasons, Python 2.7's math.ceil returns a float instead of an integer like in Python 3.x. Although Although this book was written for Python >3.4, let's make it compatible to Python 2.7 by casting it to an it explicitely:


In [9]:
from scipy.misc import comb
import math

def ensemble_error(n_classifier, error):
    k_start = int(math.ceil(n_classifier / 2.0))
    probs = [comb(n_classifier, k) * error**k * (1-error)**(n_classifier - k)
             for k in range(k_start, n_classifier + 1)]
    return sum(probs)

In [10]:
ensemble_error(n_classifier=11, error=0.25)


Out[10]:
0.034327507019042969

In [11]:
import numpy as np

error_range = np.arange(0.0, 1.01, 0.01)
ens_errors = [ensemble_error(n_classifier=11, error=error)
              for error in error_range]

In [12]:
import matplotlib.pyplot as plt

plt.plot(error_range, 
         ens_errors, 
         label='Ensemble error', 
         linewidth=2)

plt.plot(error_range, 
         error_range, 
         linestyle='--',
         label='Base error',
         linewidth=2)

plt.xlabel('Base error')
plt.ylabel('Base/Ensemble error')
plt.legend(loc='upper left')
plt.grid()
plt.tight_layout()
# plt.savefig('./figures/ensemble_err.png', dpi=300)
plt.show()




Implementing a simple majority vote classifier


In [13]:
import numpy as np

np.argmax(np.bincount([0, 0, 1], 
                      weights=[0.2, 0.2, 0.6]))


Out[13]:
1

In [14]:
ex = np.array([[0.9, 0.1],
               [0.8, 0.2],
               [0.4, 0.6]])

p = np.average(ex, 
               axis=0, 
               weights=[0.2, 0.2, 0.6])
p


Out[14]:
array([ 0.58,  0.42])

In [15]:
np.argmax(p)


Out[15]:
0

In [16]:
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.externals import six
from sklearn.base import clone
from sklearn.pipeline import _name_estimators
import numpy as np
import operator


class MajorityVoteClassifier(BaseEstimator, 
                             ClassifierMixin):
    """ A majority vote ensemble classifier

    Parameters
    ----------
    classifiers : array-like, shape = [n_classifiers]
      Different classifiers for the ensemble

    vote : str, {'classlabel', 'probability'} (default='label')
      If 'classlabel' the prediction is based on the argmax of
        class labels. Else if 'probability', the argmax of
        the sum of probabilities is used to predict the class label
        (recommended for calibrated classifiers).

    weights : array-like, shape = [n_classifiers], optional (default=None)
      If a list of `int` or `float` values are provided, the classifiers
      are weighted by importance; Uses uniform weights if `weights=None`.

    """
    def __init__(self, classifiers, vote='classlabel', weights=None):

        self.classifiers = classifiers
        self.named_classifiers = {key: value for key, value
                                  in _name_estimators(classifiers)}
        self.vote = vote
        self.weights = weights

    def fit(self, X, y):
        """ Fit classifiers.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Matrix of training samples.

        y : array-like, shape = [n_samples]
            Vector of target class labels.

        Returns
        -------
        self : object

        """
        if self.vote not in ('probability', 'classlabel'):
            raise ValueError("vote must be 'probability' or 'classlabel'"
                             "; got (vote=%r)"
                             % self.vote)

        if self.weights and len(self.weights) != len(self.classifiers):
            raise ValueError('Number of classifiers and weights must be equal'
                             '; got %d weights, %d classifiers'
                             % (len(self.weights), len(self.classifiers)))

        # Use LabelEncoder to ensure class labels start with 0, which
        # is important for np.argmax call in self.predict
        self.lablenc_ = LabelEncoder()
        self.lablenc_.fit(y)
        self.classes_ = self.lablenc_.classes_
        self.classifiers_ = []
        for clf in self.classifiers:
            fitted_clf = clone(clf).fit(X, self.lablenc_.transform(y))
            self.classifiers_.append(fitted_clf)
        return self

    def predict(self, X):
        """ Predict class labels for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Matrix of training samples.

        Returns
        ----------
        maj_vote : array-like, shape = [n_samples]
            Predicted class labels.
            
        """
        if self.vote == 'probability':
            maj_vote = np.argmax(self.predict_proba(X), axis=1)
        else:  # 'classlabel' vote

            #  Collect results from clf.predict calls
            predictions = np.asarray([clf.predict(X)
                                      for clf in self.classifiers_]).T

            maj_vote = np.apply_along_axis(
                                      lambda x:
                                      np.argmax(np.bincount(x,
                                                weights=self.weights)),
                                      axis=1,
                                      arr=predictions)
        maj_vote = self.lablenc_.inverse_transform(maj_vote)
        return maj_vote

    def predict_proba(self, X):
        """ Predict class probabilities for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        ----------
        avg_proba : array-like, shape = [n_samples, n_classes]
            Weighted average probability for each class per sample.

        """
        probas = np.asarray([clf.predict_proba(X)
                             for clf in self.classifiers_])
        avg_proba = np.average(probas, axis=0, weights=self.weights)
        return avg_proba

    def get_params(self, deep=True):
        """ Get classifier parameter names for GridSearch"""
        if not deep:
            return super(MajorityVoteClassifier, self).get_params(deep=False)
        else:
            out = self.named_classifiers.copy()
            for name, step in six.iteritems(self.named_classifiers):
                for key, value in six.iteritems(step.get_params(deep=True)):
                    out['%s__%s' % (name, key)] = value
            return out



Combining different algorithms for classification with majority vote


In [17]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
if Version(sklearn_version) < '0.18':
    from sklearn.cross_validation import train_test_split
else:
    from sklearn.model_selection import train_test_split

iris = datasets.load_iris()
X, y = iris.data[50:, [1, 2]], iris.target[50:]
le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test =\
       train_test_split(X, y, 
                        test_size=0.5, 
                        random_state=1)

In [18]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.pipeline import Pipeline
if Version(sklearn_version) < '0.18':
    from sklearn.cross_validation import cross_val_score
else:
    from sklearn.model_selection import cross_val_score

clf1 = LogisticRegression(penalty='l2', 
                          C=0.001,
                          random_state=0)

clf2 = DecisionTreeClassifier(max_depth=1,
                              criterion='entropy',
                              random_state=0)

clf3 = KNeighborsClassifier(n_neighbors=1,
                            p=2,
                            metric='minkowski')

pipe1 = Pipeline([['sc', StandardScaler()],
                  ['clf', clf1]])
pipe3 = Pipeline([['sc', StandardScaler()],
                  ['clf', clf3]])

clf_labels = ['Logistic Regression', 'Decision Tree', 'KNN']

print('10-fold cross validation:\n')
for clf, label in zip([pipe1, clf2, pipe3], clf_labels):
    scores = cross_val_score(estimator=clf,
                             X=X_train,
                             y=y_train,
                             cv=10,
                             scoring='roc_auc')
    print("ROC AUC: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))


10-fold cross validation:

ROC AUC: 0.92 (+/- 0.20) [Logistic Regression]
ROC AUC: 0.92 (+/- 0.15) [Decision Tree]
ROC AUC: 0.93 (+/- 0.10) [KNN]

In [19]:
# Majority Rule (hard) Voting

mv_clf = MajorityVoteClassifier(classifiers=[pipe1, clf2, pipe3])

clf_labels += ['Majority Voting']
all_clf = [pipe1, clf2, pipe3, mv_clf]

for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(estimator=clf,
                             X=X_train,
                             y=y_train,
                             cv=10,
                             scoring='roc_auc')
    print("ROC AUC: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))


ROC AUC: 0.92 (+/- 0.20) [Logistic Regression]
ROC AUC: 0.92 (+/- 0.15) [Decision Tree]
ROC AUC: 0.93 (+/- 0.10) [KNN]
ROC AUC: 0.97 (+/- 0.10) [Majority Voting]



Evaluating and tuning the ensemble classifier


In [20]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

colors = ['black', 'orange', 'blue', 'green']
linestyles = [':', '--', '-.', '-']
for clf, label, clr, ls \
        in zip(all_clf,
               clf_labels, colors, linestyles):

    # assuming the label of the positive class is 1
    y_pred = clf.fit(X_train,
                     y_train).predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_true=y_test,
                                     y_score=y_pred)
    roc_auc = auc(x=fpr, y=tpr)
    plt.plot(fpr, tpr,
             color=clr,
             linestyle=ls,
             label='%s (auc = %0.2f)' % (label, roc_auc))

plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1],
         linestyle='--',
         color='gray',
         linewidth=2)

plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.grid()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

# plt.tight_layout()
# plt.savefig('./figures/roc.png', dpi=300)
plt.show()



In [21]:
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)

In [22]:
from itertools import product

all_clf = [pipe1, clf2, pipe3, mv_clf]

x_min = X_train_std[:, 0].min() - 1
x_max = X_train_std[:, 0].max() + 1
y_min = X_train_std[:, 1].min() - 1
y_max = X_train_std[:, 1].max() + 1

xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))

f, axarr = plt.subplots(nrows=2, ncols=2, 
                        sharex='col', 
                        sharey='row', 
                        figsize=(7, 5))

for idx, clf, tt in zip(product([0, 1], [0, 1]),
                        all_clf, clf_labels):
    clf.fit(X_train_std, y_train)
    
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    axarr[idx[0], idx[1]].contourf(xx, yy, Z, alpha=0.3)
    
    axarr[idx[0], idx[1]].scatter(X_train_std[y_train==0, 0], 
                                  X_train_std[y_train==0, 1], 
                                  c='blue', 
                                  marker='^',
                                  s=50)
    
    axarr[idx[0], idx[1]].scatter(X_train_std[y_train==1, 0], 
                                  X_train_std[y_train==1, 1], 
                                  c='red', 
                                  marker='o',
                                  s=50)
    
    axarr[idx[0], idx[1]].set_title(tt)

plt.text(-3.5, -4.5, 
         s='Sepal width [standardized]', 
         ha='center', va='center', fontsize=12)
plt.text(-10.5, 4.5, 
         s='Petal length [standardized]', 
         ha='center', va='center', 
         fontsize=12, rotation=90)

plt.tight_layout()
# plt.savefig('./figures/voting_panel', bbox_inches='tight', dpi=300)
plt.show()



In [23]:
mv_clf.get_params()


Out[23]:
{'decisiontreeclassifier': DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=1,
             max_features=None, max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=0, splitter='best'),
 'decisiontreeclassifier__class_weight': None,
 'decisiontreeclassifier__criterion': 'entropy',
 'decisiontreeclassifier__max_depth': 1,
 'decisiontreeclassifier__max_features': None,
 'decisiontreeclassifier__max_leaf_nodes': None,
 'decisiontreeclassifier__min_impurity_split': 1e-07,
 'decisiontreeclassifier__min_samples_leaf': 1,
 'decisiontreeclassifier__min_samples_split': 2,
 'decisiontreeclassifier__min_weight_fraction_leaf': 0.0,
 'decisiontreeclassifier__presort': False,
 'decisiontreeclassifier__random_state': 0,
 'decisiontreeclassifier__splitter': 'best',
 'pipeline-1': Pipeline(steps=[['sc', StandardScaler(copy=True, with_mean=True, with_std=True)], ['clf', LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False)]]),
 'pipeline-1__clf': LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 'pipeline-1__clf__C': 0.001,
 'pipeline-1__clf__class_weight': None,
 'pipeline-1__clf__dual': False,
 'pipeline-1__clf__fit_intercept': True,
 'pipeline-1__clf__intercept_scaling': 1,
 'pipeline-1__clf__max_iter': 100,
 'pipeline-1__clf__multi_class': 'ovr',
 'pipeline-1__clf__n_jobs': 1,
 'pipeline-1__clf__penalty': 'l2',
 'pipeline-1__clf__random_state': 0,
 'pipeline-1__clf__solver': 'liblinear',
 'pipeline-1__clf__tol': 0.0001,
 'pipeline-1__clf__verbose': 0,
 'pipeline-1__clf__warm_start': False,
 'pipeline-1__sc': StandardScaler(copy=True, with_mean=True, with_std=True),
 'pipeline-1__sc__copy': True,
 'pipeline-1__sc__with_mean': True,
 'pipeline-1__sc__with_std': True,
 'pipeline-1__steps': [['sc',
   StandardScaler(copy=True, with_mean=True, with_std=True)],
  ['clf',
   LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
             intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
             penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
             verbose=0, warm_start=False)]],
 'pipeline-2': Pipeline(steps=[['sc', StandardScaler(copy=True, with_mean=True, with_std=True)], ['clf', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
            metric_params=None, n_jobs=1, n_neighbors=1, p=2,
            weights='uniform')]]),
 'pipeline-2__clf': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
            metric_params=None, n_jobs=1, n_neighbors=1, p=2,
            weights='uniform'),
 'pipeline-2__clf__algorithm': 'auto',
 'pipeline-2__clf__leaf_size': 30,
 'pipeline-2__clf__metric': 'minkowski',
 'pipeline-2__clf__metric_params': None,
 'pipeline-2__clf__n_jobs': 1,
 'pipeline-2__clf__n_neighbors': 1,
 'pipeline-2__clf__p': 2,
 'pipeline-2__clf__weights': 'uniform',
 'pipeline-2__sc': StandardScaler(copy=True, with_mean=True, with_std=True),
 'pipeline-2__sc__copy': True,
 'pipeline-2__sc__with_mean': True,
 'pipeline-2__sc__with_std': True,
 'pipeline-2__steps': [['sc',
   StandardScaler(copy=True, with_mean=True, with_std=True)],
  ['clf',
   KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
              metric_params=None, n_jobs=1, n_neighbors=1, p=2,
              weights='uniform')]]}

In [24]:
# if Version(sklearn_version) < '0.18':
#     from sklearn.cross_validation import GridSearchCV
# else:
from sklearn.model_selection import GridSearchCV

params = {'decisiontreeclassifier__max_depth': [1, 2],
          'pipeline-1__clf__C': [0.001, 0.1, 100.0]}

grid = GridSearchCV(estimator=mv_clf,
                    param_grid=params,
                    cv=10,
                    scoring='roc_auc')
grid.fit(X_train, y_train)


if Version(sklearn_version) < '0.18':
    for params, mean_score, scores in grid.grid_scores_:
        print("%0.3f +/- %0.2f %r"
              % (mean_score, scores.std() / 2.0, params))

else:
    cv_keys = ('mean_test_score', 'std_test_score','params')

    for r, _ in enumerate(grid.cv_results_['mean_test_score']):
        print("%0.3f +/- %0.2f %r"
              % (grid.cv_results_[cv_keys[0]][r], 
                 grid.cv_results_[cv_keys[1]][r] / 2.0, 
                 grid.cv_results_[cv_keys[2]][r]))


0.967 +/- 0.05 {'decisiontreeclassifier__max_depth': 1, 'pipeline-1__clf__C': 0.001}
0.967 +/- 0.05 {'decisiontreeclassifier__max_depth': 1, 'pipeline-1__clf__C': 0.1}
1.000 +/- 0.00 {'decisiontreeclassifier__max_depth': 1, 'pipeline-1__clf__C': 100.0}
0.967 +/- 0.05 {'decisiontreeclassifier__max_depth': 2, 'pipeline-1__clf__C': 0.001}
0.967 +/- 0.05 {'decisiontreeclassifier__max_depth': 2, 'pipeline-1__clf__C': 0.1}
1.000 +/- 0.00 {'decisiontreeclassifier__max_depth': 2, 'pipeline-1__clf__C': 100.0}

In [25]:
print('Best parameters: %s' % grid.best_params_)
print('Accuracy: %.2f' % grid.best_score_)


Best parameters: {'decisiontreeclassifier__max_depth': 1, 'pipeline-1__clf__C': 100.0}
Accuracy: 1.00

Note
By default, the default setting for refit in GridSearchCV is True (i.e., GridSeachCV(..., refit=True)), which means that we can use the fitted GridSearchCV estimator to make predictions via the predict method, for example:

grid = GridSearchCV(estimator=mv_clf, 
                    param_grid=params, 
                    cv=10, 
                    scoring='roc_auc')
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)

In addition, the "best" estimator can directly be accessed via the best_estimator_ attribute.


In [26]:
grid.best_estimator_.classifiers


Out[26]:
[Pipeline(steps=[['sc', StandardScaler(copy=True, with_mean=True, with_std=True)], ['clf', LogisticRegression(C=100.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False)]]),
 DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=1,
             max_features=None, max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=0, splitter='best'),
 Pipeline(steps=[['sc', StandardScaler(copy=True, with_mean=True, with_std=True)], ['clf', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
            metric_params=None, n_jobs=1, n_neighbors=1, p=2,
            weights='uniform')]])]

In [27]:
mv_clf = grid.best_estimator_

In [28]:
mv_clf.set_params(**grid.best_estimator_.get_params())


Out[28]:
MajorityVoteClassifier(classifiers=[Pipeline(steps=[('sc', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', LogisticRegression(C=100.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solv...ski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform'))])],
            vote='classlabel', weights=None)

In [29]:
mv_clf


Out[29]:
MajorityVoteClassifier(classifiers=[Pipeline(steps=[('sc', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', LogisticRegression(C=100.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solv...ski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform'))])],
            vote='classlabel', weights=None)



Bagging -- Building an ensemble of classifiers from bootstrap samples


In [30]:
Image(filename='./images/07_06.png', width=500)


---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-30-dc4766b7ef54> in <module>()
----> 1 Image(filename='./images/07_06.png', width=500)

C:\Users\lishunw.AUTH\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\display.py in __init__(self, data, url, filename, format, embed, width, height, retina, unconfined, metadata)
    756         self.unconfined = unconfined
    757         self.metadata = metadata
--> 758         super(Image, self).__init__(data=data, url=url, filename=filename)
    759 
    760         if retina:

C:\Users\lishunw.AUTH\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\display.py in __init__(self, data, url, filename)
    392         self.filename = None if filename is None else unicode_type(filename)
    393 
--> 394         self.reload()
    395         self._check_data()
    396 

C:\Users\lishunw.AUTH\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\display.py in reload(self)
    778         """Reload the raw data from file or URL."""
    779         if self.embed:
--> 780             super(Image,self).reload()
    781             if self.retina:
    782                 self._retina_shape()

C:\Users\lishunw.AUTH\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\display.py in reload(self)
    410         """Reload the raw data from file or URL."""
    411         if self.filename is not None:
--> 412             with open(self.filename, self._read_flags) as f:
    413                 self.data = f.read()
    414         elif self.url is not None:

FileNotFoundError: [Errno 2] No such file or directory: './images/07_06.png'

In [31]:
Image(filename='./images/07_07.png', width=400)


---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-31-37895d1d48db> in <module>()
----> 1 Image(filename='./images/07_07.png', width=400)

C:\Users\lishunw.AUTH\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\display.py in __init__(self, data, url, filename, format, embed, width, height, retina, unconfined, metadata)
    756         self.unconfined = unconfined
    757         self.metadata = metadata
--> 758         super(Image, self).__init__(data=data, url=url, filename=filename)
    759 
    760         if retina:

C:\Users\lishunw.AUTH\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\display.py in __init__(self, data, url, filename)
    392         self.filename = None if filename is None else unicode_type(filename)
    393 
--> 394         self.reload()
    395         self._check_data()
    396 

C:\Users\lishunw.AUTH\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\display.py in reload(self)
    778         """Reload the raw data from file or URL."""
    779         if self.embed:
--> 780             super(Image,self).reload()
    781             if self.retina:
    782                 self._retina_shape()

C:\Users\lishunw.AUTH\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\display.py in reload(self)
    410         """Reload the raw data from file or URL."""
    411         if self.filename is not None:
--> 412             with open(self.filename, self._read_flags) as f:
    413                 self.data = f.read()
    414         elif self.url is not None:

FileNotFoundError: [Errno 2] No such file or directory: './images/07_07.png'

In [32]:
import pandas as pd

df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/'
                      'machine-learning-databases/wine/wine.data',
                      header=None)

df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium', 'Total phenols',
                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                   'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',
                   'Proline']

# drop 1 class
df_wine = df_wine[df_wine['Class label'] != 1]

y = df_wine['Class label'].values
X = df_wine[['Alcohol', 'Hue']].values

In [33]:
from sklearn.preprocessing import LabelEncoder
if Version(sklearn_version) < '0.18':
    from sklearn.cross_validation import train_test_split
else:
    from sklearn.model_selection import train_test_split


le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test =\
            train_test_split(X, y, 
                             test_size=0.40, 
                             random_state=1)

In [34]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(criterion='entropy', 
                              max_depth=None,
                              random_state=1)

bag = BaggingClassifier(base_estimator=tree,
                        n_estimators=500, 
                        max_samples=1.0, 
                        max_features=1.0, 
                        bootstrap=True, 
                        bootstrap_features=False, 
                        n_jobs=1, 
                        random_state=1)

In [35]:
from sklearn.metrics import accuracy_score

tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)

tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print('Decision tree train/test accuracies %.3f/%.3f'
      % (tree_train, tree_test))

bag = bag.fit(X_train, y_train)
y_train_pred = bag.predict(X_train)
y_test_pred = bag.predict(X_test)

bag_train = accuracy_score(y_train, y_train_pred) 
bag_test = accuracy_score(y_test, y_test_pred) 
print('Bagging train/test accuracies %.3f/%.3f'
      % (bag_train, bag_test))


Decision tree train/test accuracies 1.000/0.833
Bagging train/test accuracies 1.000/0.896

In [36]:
import numpy as np
import matplotlib.pyplot as plt

x_min = X_train[:, 0].min() - 1
x_max = X_train[:, 0].max() + 1
y_min = X_train[:, 1].min() - 1
y_max = X_train[:, 1].max() + 1

xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))

f, axarr = plt.subplots(nrows=1, ncols=2, 
                        sharex='col', 
                        sharey='row', 
                        figsize=(8, 3))


for idx, clf, tt in zip([0, 1],
                        [tree, bag],
                        ['Decision Tree', 'Bagging']):
    clf.fit(X_train, y_train)

    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    axarr[idx].contourf(xx, yy, Z, alpha=0.3)
    axarr[idx].scatter(X_train[y_train == 0, 0],
                       X_train[y_train == 0, 1],
                       c='blue', marker='^')

    axarr[idx].scatter(X_train[y_train == 1, 0],
                       X_train[y_train == 1, 1],
                       c='red', marker='o')

    axarr[idx].set_title(tt)

axarr[0].set_ylabel('Alcohol', fontsize=12)
plt.text(10.2, -1.2,
         s='Hue',
         ha='center', va='center', fontsize=12)

plt.tight_layout()
# plt.savefig('./figures/bagging_region.png',
#            dpi=300,
#            bbox_inches='tight')
plt.show()




Leveraging weak learners via adaptive boosting


In [37]:
Image(filename='./images/07_09.png', width=400)


---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-37-c033a6f3415d> in <module>()
----> 1 Image(filename='./images/07_09.png', width=400)

C:\Users\lishunw.AUTH\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\display.py in __init__(self, data, url, filename, format, embed, width, height, retina, unconfined, metadata)
    756         self.unconfined = unconfined
    757         self.metadata = metadata
--> 758         super(Image, self).__init__(data=data, url=url, filename=filename)
    759 
    760         if retina:

C:\Users\lishunw.AUTH\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\display.py in __init__(self, data, url, filename)
    392         self.filename = None if filename is None else unicode_type(filename)
    393 
--> 394         self.reload()
    395         self._check_data()
    396 

C:\Users\lishunw.AUTH\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\display.py in reload(self)
    778         """Reload the raw data from file or URL."""
    779         if self.embed:
--> 780             super(Image,self).reload()
    781             if self.retina:
    782                 self._retina_shape()

C:\Users\lishunw.AUTH\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\display.py in reload(self)
    410         """Reload the raw data from file or URL."""
    411         if self.filename is not None:
--> 412             with open(self.filename, self._read_flags) as f:
    413                 self.data = f.read()
    414         elif self.url is not None:

FileNotFoundError: [Errno 2] No such file or directory: './images/07_09.png'

In [38]:
Image(filename='./images/07_10.png', width=500)


---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-38-143750ed8f8b> in <module>()
----> 1 Image(filename='./images/07_10.png', width=500)

C:\Users\lishunw.AUTH\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\display.py in __init__(self, data, url, filename, format, embed, width, height, retina, unconfined, metadata)
    756         self.unconfined = unconfined
    757         self.metadata = metadata
--> 758         super(Image, self).__init__(data=data, url=url, filename=filename)
    759 
    760         if retina:

C:\Users\lishunw.AUTH\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\display.py in __init__(self, data, url, filename)
    392         self.filename = None if filename is None else unicode_type(filename)
    393 
--> 394         self.reload()
    395         self._check_data()
    396 

C:\Users\lishunw.AUTH\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\display.py in reload(self)
    778         """Reload the raw data from file or URL."""
    779         if self.embed:
--> 780             super(Image,self).reload()
    781             if self.retina:
    782                 self._retina_shape()

C:\Users\lishunw.AUTH\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\display.py in reload(self)
    410         """Reload the raw data from file or URL."""
    411         if self.filename is not None:
--> 412             with open(self.filename, self._read_flags) as f:
    413                 self.data = f.read()
    414         elif self.url is not None:

FileNotFoundError: [Errno 2] No such file or directory: './images/07_10.png'

In [39]:
from sklearn.ensemble import AdaBoostClassifier

tree = DecisionTreeClassifier(criterion='entropy', 
                              max_depth=1,
                              random_state=0)

ada = AdaBoostClassifier(base_estimator=tree,
                         n_estimators=500, 
                         learning_rate=0.1,
                         random_state=0)

In [40]:
tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)

tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print('Decision tree train/test accuracies %.3f/%.3f'
      % (tree_train, tree_test))

ada = ada.fit(X_train, y_train)
y_train_pred = ada.predict(X_train)
y_test_pred = ada.predict(X_test)

ada_train = accuracy_score(y_train, y_train_pred) 
ada_test = accuracy_score(y_test, y_test_pred) 
print('AdaBoost train/test accuracies %.3f/%.3f'
      % (ada_train, ada_test))


Decision tree train/test accuracies 0.845/0.854
AdaBoost train/test accuracies 1.000/0.875

In [41]:
x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))

f, axarr = plt.subplots(1, 2, sharex='col', sharey='row', figsize=(8, 3))


for idx, clf, tt in zip([0, 1],
                        [tree, ada],
                        ['Decision Tree', 'AdaBoost']):
    clf.fit(X_train, y_train)

    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    axarr[idx].contourf(xx, yy, Z, alpha=0.3)
    axarr[idx].scatter(X_train[y_train == 0, 0],
                       X_train[y_train == 0, 1],
                       c='blue', marker='^')
    axarr[idx].scatter(X_train[y_train == 1, 0],
                       X_train[y_train == 1, 1],
                       c='red', marker='o')
    axarr[idx].set_title(tt)

axarr[0].set_ylabel('Alcohol', fontsize=12)
plt.text(10.2, -1.2,
         s='Hue',
         ha='center', va='center', fontsize=12)

plt.tight_layout()
# plt.savefig('./figures/adaboost_region.png',
#           dpi=300,
#           bbox_inches='tight')
plt.show()




Summary

...