In this tutorial, we'll build a stump classifier and apply the AdaBoost algorithm. Our goal is to transform a weak classifier into something useful.
This lecture covers the first part of chapter 7 in Peter Harrington's book (Harrington, P. (2012). Machine Learning in Action. Shelter Island, NY: Manning) with some added commentary.
In [9]:
# base requirements
from IPython.display import Image
from IPython.display import display
from datetime import *
import json
from copy import *
from pprint import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import rpy2
%load_ext rpy2.ipython
%R require("ggplot2")
%matplotlib inline
from ggplot import *
randn = np.random.randn
# optional
import warnings
warnings.filterwarnings('ignore')
# tutorial requirements
#bokeh - http://bokeh.pydata.org/en/latest/docs/installation.html
from bokeh.io import output_notebook
from bokeh.plotting import figure, output_file, show
output_notebook() # inline graphs
#import bokeh.sampledata # this download is commented out b/c it's optional
# bokeh.sampledata.download() # this download is commented out b/c it's optional
In [10]:
def stumpClassify(dataMatrix,dimen,threshVal,threshIneq):#just classify the data
"""
Performs a threshold comparison to classify data.
Everything on one side of the threshold is thrown into class -1,
and everything on the other side is thrown into class +1.
"""
retArray = np.ones((np.shape(dataMatrix)[0],1))
#print "retArray"
#display(retArray)
if threshIneq == 'lt':
retArray[dataMatrix[:,dimen] <= threshVal] = -1.0
else:
        retArray[dataMatrix[:,dimen] > threshVal] = -1.0
return retArray
def buildStump(dataArr,classLabels,D):
"""
Iterates over all of the possible inputs to stumpClassify() and finds
the best decision stump for our dataset. Best here will be with respect
to the data weight vector D.
"""
dataMatrix = np.mat(dataArr); labelMat = np.mat(classLabels).T
#print "dataMatrix:"
#display(dataMatrix)
#print "labelMat:"
#display(labelMat)
m,n = np.shape(dataMatrix)
#print ("m:{}, n:{}".format(m,n))
numSteps = 10.0; bestStump = {}; bestClasEst = np.mat(np.zeros((m,1)))
#print "bestClasEst:"
#display(bestClasEst)
minError = np.inf #init error sum, to +infinity
#print "minError:"
#display(minError)
#The first one goes over all the features in our dataset. You’re
# considering numeric values, and you calculate the minimum and
# maximum to see how large your step size should be.
for i in range(n):#loop over all dimensions
rangeMin = dataMatrix[:,i].min(); rangeMax = dataMatrix[:,i].max();
stepSize = (rangeMax-rangeMin)/numSteps
#print "stepSize:{}".format(stepSize)
# The next for loops loop over these values.
for j in range(-1,int(numSteps)+1):#loop over all range in current dimension
#The last for loop toggles your inequality between greater than and less than
for inequal in ['lt', 'gt']: #go over less than and greater than
threshVal = (rangeMin + float(j) * stepSize) #value at which we make our decision to classify one way or another
predictedVals = stumpClassify(dataMatrix,i,threshVal,inequal) #returns labels for each element
errArr = np.mat(np.ones((m,1)))
errArr[predictedVals == labelMat] = 0
#print "\n\nerrArr:"
#display(errArr)
#display(D.T)
weightedError = D.T*errArr #calc total error multiplied by D <---------D is constant in this function but varied inside AdaBoost
#print "weightedError:"
#display(weightedError)
#####
##### uncomment line below for 1st pass
#####
#print "split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError)
                if weightedError < minError: #finds the best stump
minError = weightedError
bestClasEst = predictedVals.copy()
bestStump['feature'] = i+1
bestStump['thresh'] = threshVal
bestStump['ineq'] = inequal
return bestStump,minError,bestClasEst
def alpha(error):
return float(0.5*np.log((1.0-error)/max(error,1e-16)))
def adaBoostTrainDS(dataArr,classLabels,numIt=40):
"""
The implementation of AdaBoost. We get back a set of weak \
classifiers and weights (the signs of which we use as labels).
"""
weakClassArr = []
m = np.shape(dataArr)[0]
D = np.mat(np.ones((m,1))/m) #init D to all weights being equal
aggClassEst = np.mat(np.zeros((m,1))) #init to zero
for i in range(numIt):
bestStump,error,classEst = buildStump(dataArr,classLabels,D)# note: D varies to improve the classifier
alpha = float(0.5*np.log((1.0-error)/max(error,1e-16)))#calc alpha; note: max(error,eps) accounts for error=0
bestStump['alpha'] = alpha
weakClassArr.append(bestStump) #store Stump Params in Array
#print "classEst: ",classEst.T
expon = np.multiply(-1*alpha*np.mat(classLabels).T,classEst) #exponent for D calc, notice that multiplying \
# np.mat(classLabels).T & classEst is for sign \
# that drives D values to 0 or 1
D = np.multiply(D,np.exp(expon)) #Calc New D for next iteration
D = D/D.sum() # D.sum() normalizes the values as probabilities that all sum to 1
#calc training error of all classifiers, if this is 0 quit for loop early (use break)
aggClassEst += alpha*classEst # <----- the magic; this allows the signs (labels) to be pushed around
aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T,np.ones((m,1))) # 1's when error
errorRate = aggErrors.sum()/m # percent error
print "total error: ",errorRate
if errorRate == 0.0: break
return weakClassArr,aggClassEst
def adaClassify(datToClass,classifierArr):
"""
Given an unknown datum, we label it from training data.
"""
dataMatrix = np.mat(datToClass)
m = np.shape(dataMatrix)[0]
#print "m:{}".format(m)
aggClassEst = np.mat(np.zeros((m,1))) # predicted values
#print "initial aggClassEst:{}".format(aggClassEst)
for i in range(len(classifierArr)):
classEst = stumpClassify(dataMatrix,classifierArr[i]['feature']-1\
, classifierArr[i]['thresh']\
, classifierArr[i]['ineq'])#call stump classify
aggClassEst += classifierArr[i]['alpha']*classEst
print aggClassEst
return np.sign(aggClassEst)
def loadData():
"""
Loads sample dataset as arrays.
"""
datMat = np.array([[ 1. , 2.1],
[2., 1.1], [1.3, 1.], [1., 1.], [2., 1.]])
classLabels = np.array([1.0, 1.0, -1.0, -1.0, 1.0])
return datMat,classLabels
def loadSimpData():
"""
Loads dataset as matrix.
"""
datMat = np.matrix([[ 1. , 2.1],
[2., 1.1], [1.3, 1.], [1., 1.], [2., 1.]])
classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]
return datMat,classLabels
def build_simple_bokeh_graph():
data,labels = loadData()
print "data:"
display(data)
print "labels:"
display(labels)
print "Feature 1 data:"
d1 = data[(labels==1)]
display(d1)
print "Feature 2 data:"
d2 = data[(labels==-1)]
display(d2)
## set up Bokeh figure
p = figure(
tools="pan,box_zoom,reset,save"
, title="Data: Two Features & Two Classes"
#y_axis_type="log", y_range=[0.001, 10**11]
, x_axis_label='Feature 1'
, y_axis_label='Feature 2'
)
## add data to Bokeh figure
p.scatter(d1[:,0], d1[:,1], legend="class1", fill_color="red", size=20,marker="circle")
p.scatter(d2[:,0], d2[:,1], legend="class2", fill_color="blue", size=20,marker="square")
# display Bokeh figure
show(p)
def run_stump():
# run stump classifier without adaboost
datMat,classLabels=loadSimpData()
print "Data:"
display(datMat)
D = np.mat( np.ones((5,1)) / 5 )
print "initial D:"
display(D)
numSteps = 10.0;
print "TEST:"
x,y,z=buildStump(datMat,classLabels,D) # note: D is constant here, but this is the value that we will vary with adaboost.
print "\n\nRESTULS:"
print " bestStump:{}".format(x)
print " smallest error:{}".format(y)
print " predicted labels:"
display(z)
def graph_alpha():
# Create graph of alpha values
x = np.arange(0.01,1,0.01)
alpha_calc = np.vectorize(alpha)
y = alpha_calc(x)
## Bokeh output inline
#output_notebook()
## set up Bokeh figure
p = figure(
tools="pan,box_zoom,reset,save"
, title="How are the classifiers scored?"
#y_axis_type="log", y_range=[0.001, 10**11]
, x_axis_label='Error'
, y_axis_label='Alpha'
)
## add data to Bokeh figure
p.line(x, y, legend="alpha curve", color="blue", line_width=2)
# guide line
a = np.array([.5,.5])
b = np.array([-1,1])
p.line(a,b, legend="50% error", color="red",line_width = 1, alpha=0.6, line_dash="4 4")
# display Bokeh figure
show(p)
def simple_application():
datArr,labelArr=loadSimpData()
print "Building training set."
classifierArr = adaBoostTrainDS(datArr,labelArr,30)
print "\nclassifierArr:"
display(classifierArr[0])
print "Classification of unknown point:"
adaClassify([0, 0],classifierArr[0])
In [11]:
build_simple_bokeh_graph()
Which individual feature best helps us classify this dataset? Notice that no single threshold on either feature separates the two classes perfectly, so any one stump will always misclassify at least one point. As such, we could call this method a weak classifier.
Let's first see how to build a decision stump: test whether values are less than or greater than a candidate threshold, and then loop over a weighted version of the dataset to find the stump that yields the lowest weighted error.
One important distinction at this point is that we're using equal weights across all elements in the dataset. Later, we'll use the AdaBoost algorithm to change these weights to optimize the accuracy of the labels.
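Before running the full search, here's a minimal sketch of what one pass of that loop does, assuming the cells above (stumpClassify() and loadSimpData()) have been run: it scores a single candidate split, feature 1 at threshold 1.3 with the 'lt' inequality, under equal weights.
# minimal sketch: score one candidate stump by hand
# assumes stumpClassify() and loadSimpData() from the cells above are defined
datMat, classLabels = loadSimpData()
m = np.shape(datMat)[0]
D = np.mat(np.ones((m, 1)) / m)                  # equal weights: 0.2 each
predicted = stumpClassify(datMat, 0, 1.3, 'lt')  # feature 1 (column 0), threshold 1.3
errArr = np.mat(np.ones((m, 1)))
errArr[predicted == np.mat(classLabels).T] = 0   # 1 where the stump is wrong
print "weighted error:", float(D.T * errArr)     # one misclassified point -> 0.2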
In [12]:
run_stump()
After building our stump classifier, we'll try to improve it using AdaBoost. We're going to change one set of values: $D$, a vector of weights over the data points, and we'll change it through an iterative process. This weight vector adjusts for incorrect labels: at each iteration we increase the weight on the points we classified incorrectly and decrease the weight on the points we classified correctly. Initially, all of these weights are equal, but at each iteration we re-evaluate them based on failure or success. Hence, each point in the dataset receives a custom weight depending on how well we classified it in the previous iteration.
To calculate alpha, $\alpha$, we sum up the weighted errors for each stump. In short, the vector $D$ is varied from stump to stump, and each stump is scored with an alpha value.
Before we move on to understand how AdaBoost uses our set of alpha values, let's look a little more deeply at what this score means.
We calculate our error rate with \begin{equation*} \epsilon = \frac{\text{number of incorrectly classified examples}}{\text{total number of examples}} \end{equation*} These errors are multiplied by the weights, and then the alpha value is calculated as follows: \begin{equation*} \alpha = \frac{1}{2}\ln\!\left(\frac{1-\epsilon}{\epsilon}\right) \end{equation*}
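As a quick sanity check on this formula, the sketch below (reusing the alpha() helper defined in the code above) evaluates it at a few error rates: a 20% error earns a positive score, a coin-flip 50% error scores zero, and anything worse goes negative.
# evaluate the alpha formula at a few error rates, using the alpha() helper above
for eps in [0.1, 0.2, 0.5, 0.7]:
    print "error = %.1f  ->  alpha = %.3f" % (eps, alpha(eps))
# error = 0.1  ->  alpha = 1.099
# error = 0.2  ->  alpha = 0.693
# error = 0.5  ->  alpha = 0.000
# error = 0.7  ->  alpha = -0.424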
Let's look at a graph of alpha values.
In [13]:
graph_alpha()
(see Chris McCormick's discussion https://chrisjmccormick.wordpress.com/2013/12/13/adaboost-tutorial/)
We use alpha across a series of iterations that drive the labeling error closer to zero: we sum the product of each stump's alpha and its predicted values, which gives a vector of floats whose signs indicate our labels.
We now understand that alpha is computed from a stump's weighted error and determines how much weight that stump carries. Now we just need to understand how $\alpha$ relates to the individualized weights in the vector $D$:
If example $i$ was correctly predicted, \begin{equation*} D_{i}^{(t+1)} = \frac{D_{i}^{(t)}e^{-\alpha}}{\mathrm{Sum}(D)} \end{equation*} and if it was incorrectly predicted, \begin{equation*} D_{i}^{(t+1)} = \frac{D_{i}^{(t)}e^{\alpha}}{\mathrm{Sum}(D)} \end{equation*}
D is a probability distribution, so the sum of all the elements in D must be 1.0.
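To make the update concrete, here's a minimal sketch of one round of re-weighting on the sample data, assuming the first stump found above (feature 1, threshold 1.3, 'lt'), which misclassifies only the first point.
# one round of the D update by hand (assumes the functions defined above have been run)
datMat, classLabels = loadSimpData()
labelMat = np.mat(classLabels).T
m = np.shape(datMat)[0]
D = np.mat(np.ones((m, 1)) / m)                   # start with equal weights
classEst = stumpClassify(datMat, 0, 1.3, 'lt')    # first stump's predictions
a = alpha(0.2)                                    # its weighted error under equal weights is 0.2
expon = np.multiply(-1 * a * labelMat, classEst)  # -alpha where correct, +alpha where wrong
D = np.multiply(D, np.exp(expon))
D = D / D.sum()                                   # renormalize so the weights sum to 1
print D.T  # the misclassified first point now carries weight 0.5; the rest drop to 0.125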
Let's consider the entire AdaBoost process:
In this section, we'll apply the AdaBoost algorithm to labeled data. As we evaluate each of the classifiers, we score them with an alpha value. Finally, we sum the product of the predicted labels and alpha for each point to create a matrix of floats. Each value in this matrix has a sign, which should correspond to the correct labels if our error went to zero.
In [18]:
datMat,classLabels=loadSimpData()
adaBoostTrainDS(datMat,classLabels,9)
With the code that we've already written, we have a list of weak classifiers, each with its corresponding alpha score:
[{'alpha': 0.6931471805599453, 'feature': 1, 'ineq': 'lt', 'thresh': 1.3}, {'alpha': 0.9729550745276565, 'feature': 2, 'ineq': 'lt', 'thresh': 1.0}, {'alpha': 0.8958797346140273, 'feature': 1, 'ineq': 'lt', 'thresh': 0.9}]
We can reuse the threshold value of the corresponding feature in each of these weak classifiers as a stump to label the unknown data. We'll recycle stumpClassify() on the new data, which means we can weight each classifier's label by its previously assigned alpha value. See adaClassify().
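To see what adaClassify() does with the unknown point [0, 0], here's a minimal sketch of the aggregation by hand, assuming the three weak classifiers listed above (alpha values rounded): each stump votes -1 for this point, the weighted votes accumulate to roughly -2.56, and the sign of that total is the final label.
# aggregate the weighted votes for a single point by hand
stumps = [{'alpha': 0.6931, 'feature': 1, 'ineq': 'lt', 'thresh': 1.3},
          {'alpha': 0.9730, 'feature': 2, 'ineq': 'lt', 'thresh': 1.0},
          {'alpha': 0.8959, 'feature': 1, 'ineq': 'lt', 'thresh': 0.9}]
point = np.mat([[0.0, 0.0]])
agg = 0.0
for s in stumps:
    vote = float(stumpClassify(point, s['feature'] - 1, s['thresh'], s['ineq']))
    agg += s['alpha'] * vote
    print "vote: %+d, running total: %+.4f" % (int(vote), agg)
print "label:", np.sign(agg)   # -1.0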
In [15]:
simple_application()
(notes below borrow from Eric Emer's presentation)
Pros and cons of AdaBoost (see Emer's slides).
The data points that have been misclassified most by the previous weak classifier are pinpointed and become the focus for the next iteration. Pinpointed here means that these regularly misclassified elements receive a larger weight and therefore contribute more to the weighted error.
With the score (alpha value) applied to each classifier's prediction set, we aggregate the scores element by element. The aggregated vector provides an optimally weighted majority vote of the weak classifiers!
See Robert Schapire's Explaining AdaBoost for a good discussion of AdaBoost.
Bagging vs. boosting (comparison from Emer's slides).
Finally, here is the two-class AdaBoost example from scikit-learn, which fits boosted decision stumps to two overlapping Gaussian-quantile classes: http://scikit-learn.org/stable/auto_examples/ensemble/plot_adaboost_twoclass.html
In [16]:
print(__doc__)
# Author: Noel Dawe <noel.dawe@gmail.com>
#
# License: BSD 3 clause
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_gaussian_quantiles
# Construct dataset
X1, y1 = make_gaussian_quantiles(cov=2.,
n_samples=200, n_features=2,
n_classes=2, random_state=1)
X2, y2 = make_gaussian_quantiles(mean=(3, 3), cov=1.5,
n_samples=300, n_features=2,
n_classes=2, random_state=1)
X = np.concatenate((X1, X2))
y = np.concatenate((y1, - y2 + 1))
# Create and fit an AdaBoosted decision tree
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
algorithm="SAMME",
n_estimators=200)
bdt.fit(X, y)
plot_colors = "br"
plot_step = 0.02
class_names = "AB"
plt.figure(figsize=(10, 5))
# Plot the decision boundaries
plt.subplot(121)
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
np.arange(y_min, y_max, plot_step))
Z = bdt.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
plt.axis("tight")
# Plot the training points
for i, n, c in zip(range(2), class_names, plot_colors):
idx = np.where(y == i)
plt.scatter(X[idx, 0], X[idx, 1],
c=c, cmap=plt.cm.Paired,
label="Class %s" % n)
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.legend(loc='upper right')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Decision Boundary')
# Plot the two-class decision scores
twoclass_output = bdt.decision_function(X)
plot_range = (twoclass_output.min(), twoclass_output.max())
plt.subplot(122)
for i, n, c in zip(range(2), class_names, plot_colors):
plt.hist(twoclass_output[y == i],
bins=10,
range=plot_range,
facecolor=c,
label='Class %s' % n,
alpha=.5)
x1, x2, y1, y2 = plt.axis()
plt.axis((x1, x2, y1, y2 * 1.2))
plt.legend(loc='upper right')
plt.ylabel('Samples')
plt.xlabel('Score')
plt.title('Decision Scores')
plt.tight_layout()
plt.subplots_adjust(wspace=0.35)
plt.show()