In this tutorial, we'll build a stump classifier and apply the AdaBoost algorithm. Our goal is to transform a weak classifier into something useful.
This lecture covers the first part of chapter 7 in Peter Harrington's book (Harrington, P. (2012). Machine Learning in Action. Shelter Island, NY: Manning) with some added commentary.
In [9]:
# base requirements
from IPython.display import Image
from IPython.display import display
from datetime import *
import json
from copy import *
from pprint import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import rpy2
%load_ext rpy2.ipython
%R require("ggplot2")
%matplotlib inline
from ggplot import *
randn = np.random.randn
# optional
import warnings
warnings.filterwarnings('ignore')
# tutorial requirements
#bokeh - http://bokeh.pydata.org/en/latest/docs/installation.html
from bokeh.io import output_notebook
from bokeh.plotting import figure, output_file, show
output_notebook() # inline graphs
#import bokeh.sampledata # this download is commented out b/c it's optional
# bokeh.sampledata.download() # this download is commented out b/c it's optional
In [10]:
def stumpClassify(dataMatrix,dimen,threshVal,threshIneq):#just classify the data
"""
Performs a threshold comparison to classify data.
Everything on one side of the threshold is thrown into class -1,
and everything on the other side is thrown into class +1.
"""
retArray = np.ones((np.shape(dataMatrix)[0],1))
#print "retArray"
#display(retArray)
if threshIneq == 'lt':
retArray[dataMatrix[:,dimen] <= threshVal] = -1.0
else:
        retArray[dataMatrix[:,dimen] > threshVal] = -1.0
return retArray
def buildStump(dataArr,classLabels,D):
"""
Iterates over all of the possible inputs to stumpClassify() and finds
the best decision stump for our dataset. Best here will be with respect
to the data weight vector D.
"""
dataMatrix = np.mat(dataArr); labelMat = np.mat(classLabels).T
#print "dataMatrix:"
#display(dataMatrix)
#print "labelMat:"
#display(labelMat)
m,n = np.shape(dataMatrix)
#print ("m:{}, n:{}".format(m,n))
numSteps = 10.0; bestStump = {}; bestClasEst = np.mat(np.zeros((m,1)))
#print "bestClasEst:"
#display(bestClasEst)
minError = np.inf #init error sum, to +infinity
#print "minError:"
#display(minError)
#The first one goes over all the features in our dataset. You’re
# considering numeric values, and you calculate the minimum and
# maximum to see how large your step size should be.
for i in range(n):#loop over all dimensions
rangeMin = dataMatrix[:,i].min(); rangeMax = dataMatrix[:,i].max();
stepSize = (rangeMax-rangeMin)/numSteps
#print "stepSize:{}".format(stepSize)
# The next for loops loop over these values.
for j in range(-1,int(numSteps)+1):#loop over all range in current dimension
#The last for loop toggles your inequality between greater than and less than
for inequal in ['lt', 'gt']: #go over less than and greater than
threshVal = (rangeMin + float(j) * stepSize) #value at which we make our decision to classify one way or another
predictedVals = stumpClassify(dataMatrix,i,threshVal,inequal) #returns labels for each element
errArr = np.mat(np.ones((m,1)))
errArr[predictedVals == labelMat] = 0
#print "\n\nerrArr:"
#display(errArr)
#display(D.T)
weightedError = D.T*errArr #calc total error multiplied by D <---------D is constant in this function but varied inside AdaBoost
#print "weightedError:"
#display(weightedError)
#####
##### uncomment line below for 1st pass
#####
#print "split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError)
                if weightedError < minError: #finds the best stump
minError = weightedError
bestClasEst = predictedVals.copy()
bestStump['feature'] = i+1
bestStump['thresh'] = threshVal
bestStump['ineq'] = inequal
return bestStump,minError,bestClasEst
def alpha(error):
return float(0.5*np.log((1.0-error)/max(error,1e-16)))
def adaBoostTrainDS(dataArr,classLabels,numIt=40):
"""
The implementation of AdaBoost. We get back a set of weak \
classifiers and weights (the signs of which we use as labels).
"""
weakClassArr = []
m = np.shape(dataArr)[0]
D = np.mat(np.ones((m,1))/m) #init D to all weights being equal
aggClassEst = np.mat(np.zeros((m,1))) #init to zero
for i in range(numIt):
bestStump,error,classEst = buildStump(dataArr,classLabels,D)# note: D varies to improve the classifier
alpha = float(0.5*np.log((1.0-error)/max(error,1e-16)))#calc alpha; note: max(error,eps) accounts for error=0
bestStump['alpha'] = alpha
weakClassArr.append(bestStump) #store Stump Params in Array
#print "classEst: ",classEst.T
expon = np.multiply(-1*alpha*np.mat(classLabels).T,classEst) #exponent for D calc, notice that multiplying \
# np.mat(classLabels).T & classEst is for sign \
# that drives D values to 0 or 1
D = np.multiply(D,np.exp(expon)) #Calc New D for next iteration
D = D/D.sum() # D.sum() normalizes the values as probabilities that all sum to 1
#calc training error of all classifiers, if this is 0 quit for loop early (use break)
aggClassEst += alpha*classEst # <----- the magic; this allows the signs (labels) to be pushed around
aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T,np.ones((m,1))) # 1's when error
errorRate = aggErrors.sum()/m # percent error
print "total error: ",errorRate
if errorRate == 0.0: break
return weakClassArr,aggClassEst
def adaClassify(datToClass,classifierArr):
"""
Given an unknown datum, we label it from training data.
"""
dataMatrix = np.mat(datToClass)
m = np.shape(dataMatrix)[0]
#print "m:{}".format(m)
aggClassEst = np.mat(np.zeros((m,1))) # predicted values
#print "initial aggClassEst:{}".format(aggClassEst)
for i in range(len(classifierArr)):
classEst = stumpClassify(dataMatrix,classifierArr[i]['feature']-1\
, classifierArr[i]['thresh']\
, classifierArr[i]['ineq'])#call stump classify
aggClassEst += classifierArr[i]['alpha']*classEst
print aggClassEst
return np.sign(aggClassEst)
def loadData():
"""
Loads sample dataset as arrays.
"""
datMat = np.array([[ 1. , 2.1],
[2., 1.1], [1.3, 1.], [1., 1.], [2., 1.]])
classLabels = np.array([1.0, 1.0, -1.0, -1.0, 1.0])
return datMat,classLabels
def loadSimpData():
"""
Loads dataset as matrix.
"""
datMat = np.matrix([[ 1. , 2.1],
[2., 1.1], [1.3, 1.], [1., 1.], [2., 1.]])
classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]
return datMat,classLabels
def build_simple_bokeh_graph():
data,labels = loadData()
print "data:"
display(data)
print "labels:"
display(labels)
print "Feature 1 data:"
d1 = data[(labels==1)]
display(d1)
print "Feature 2 data:"
d2 = data[(labels==-1)]
display(d2)
## set up Bokeh figure
p = figure(
tools="pan,box_zoom,reset,save"
, title="Data: Two Features & Two Classes"
#y_axis_type="log", y_range=[0.001, 10**11]
, x_axis_label='Feature 1'
, y_axis_label='Feature 2'
)
## add data to Bokeh figure
p.scatter(d1[:,0], d1[:,1], legend="class1", fill_color="red", size=20,marker="circle")
p.scatter(d2[:,0], d2[:,1], legend="class2", fill_color="blue", size=20,marker="square")
# display Bokeh figure
show(p)
def run_stump():
# run stump classifier without adaboost
datMat,classLabels=loadSimpData()
print "Data:"
display(datMat)
D = np.mat( np.ones((5,1)) / 5 )
print "initial D:"
display(D)
numSteps = 10.0;
print "TEST:"
x,y,z=buildStump(datMat,classLabels,D) # note: D is constant here, but this is the value that we will vary with adaboost.
print "\n\nRESTULS:"
print " bestStump:{}".format(x)
print " smallest error:{}".format(y)
print " predicted labels:"
display(z)
def graph_alpha():
# Create graph of alpha values
x = np.arange(0.01,1,0.01)
alpha_calc = np.vectorize(alpha)
y = alpha_calc(x)
## Bokeh output inline
#output_notebook()
## set up Bokeh figure
p = figure(
tools="pan,box_zoom,reset,save"
, title="How are the classifiers scored?"
#y_axis_type="log", y_range=[0.001, 10**11]
, x_axis_label='Error'
, y_axis_label='Alpha'
)
## add data to Bokeh figure
p.line(x, y, legend="alpha curve", color="blue", line_width=2)
# guide line
a = np.array([.5,.5])
b = np.array([-1,1])
p.line(a,b, legend="50% error", color="red",line_width = 1, alpha=0.6, line_dash="4 4")
# display Bokeh figure
show(p)
def simple_application():
datArr,labelArr=loadSimpData()
print "Building training set."
classifierArr = adaBoostTrainDS(datArr,labelArr,30)
print "\nclassifierArr:"
display(classifierArr[0])
print "Classification of unknown point:"
adaClassify([0, 0],classifierArr[0])
In [11]:
build_simple_bokeh_graph()
Which individual feature best helps us classify this dataset? Notice that no single threshold on either feature separates the two classes perfectly, so any one stump will always misclassify at least one point. As such, we could call this method a weak classifier.
Let's first see how to build a decision stump: test whether values are less than or greater than a candidate threshold, and then loop over a weighted version of the dataset to find the stump that yields the lowest weighted error.
One important distinction at this point is that we're using equal weights across all elements in the dataset. Later, we'll use the AdaBoost algorithm to change these weights to optimize the accuracy of the labels.
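Before running the full search, here's a minimal sketch of what one pass of that loop does, assuming the cells above (stumpClassify() and loadSimpData()) have been run: it scores a single candidate split, feature 1 at threshold 1.3 with the 'lt' inequality, under equal weights.
# minimal sketch: score one candidate stump by hand
# assumes stumpClassify() and loadSimpData() from the cells above are defined
datMat, classLabels = loadSimpData()
m = np.shape(datMat)[0]
D = np.mat(np.ones((m, 1)) / m)                  # equal weights: 0.2 each
predicted = stumpClassify(datMat, 0, 1.3, 'lt')  # feature 1 (column 0), threshold 1.3
errArr = np.mat(np.ones((m, 1)))
errArr[predicted == np.mat(classLabels).T] = 0   # 1 where the stump is wrong
print "weighted error:", float(D.T * errArr)     # one misclassified point -> 0.2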
In [12]:
run_stump()
After building our stump classifier, we'll try to improve it using AdaBoost. We're going to change one set of values: $D$, a vector of weights over the data points, and we'll change it through an iterative process. This weight vector adjusts for incorrect labels: at each iteration we increase the weight on the points we classified incorrectly and decrease the weight on the points we classified correctly. Initially, all of these weights are equal, but at each iteration we re-evaluate them based on failure or success. Hence, each point in the dataset receives a custom weight depending on how well we classified it in the previous iteration.
To calculate alpha, $\alpha$, we sum up the weighted errors for each stump. In short, the vector $D$ is varied from stump to stump, and each stump is scored with an alpha value.
Before we move on to understand how AdaBoost uses our set of alpha values, let's look a little more deeply at what this score means.
We calculate our error rate with \begin{equation*} \epsilon = \frac{\text{number of incorrectly classified examples}}{\text{total number of examples}} \end{equation*} These errors are multiplied by the weights, and then the alpha value is calculated as follows: \begin{equation*} \alpha = \frac{1}{2}\ln\!\left(\frac{1-\epsilon}{\epsilon}\right) \end{equation*}
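As a quick sanity check on this formula, the sketch below (reusing the alpha() helper defined in the code above) evaluates it at a few error rates: a 20% error earns a positive score, a coin-flip 50% error scores zero, and anything worse goes negative.
# evaluate the alpha formula at a few error rates, using the alpha() helper above
for eps in [0.1, 0.2, 0.5, 0.7]:
    print "error = %.1f  ->  alpha = %.3f" % (eps, alpha(eps))
# error = 0.1  ->  alpha = 1.099
# error = 0.2  ->  alpha = 0.693
# error = 0.5  ->  alpha = 0.000
# error = 0.7  ->  alpha = -0.424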
Let's look at a graph of alpha values.
In [13]:
graph_alpha()
(see Chris McCormick's discussion https://chrisjmccormick.wordpress.com/2013/12/13/adaboost-tutorial/)
We use alpha across a series of iterations that drive the labeling error closer to zero: we sum the product of each stump's alpha and its predicted values, which gives a vector of floats whose signs indicate our labels.
We now understand that alpha is computed from a stump's weighted error and determines how much weight that stump carries. Now we just need to understand how $\alpha$ relates to the individualized weights in the vector $D$:
If example $i$ was correctly predicted, \begin{equation*} D_{i}^{(t+1)} = \frac{D_{i}^{(t)}e^{-\alpha}}{\mathrm{Sum}(D)} \end{equation*} and if it was incorrectly predicted, \begin{equation*} D_{i}^{(t+1)} = \frac{D_{i}^{(t)}e^{\alpha}}{\mathrm{Sum}(D)} \end{equation*}
D is a probability distribution, so the sum of all the elements in D must be 1.0.
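To make the update concrete, here's a minimal sketch of one round of re-weighting on the sample data, assuming the first stump found above (feature 1, threshold 1.3, 'lt'), which misclassifies only the first point.
# one round of the D update by hand (assumes the functions defined above have been run)
datMat, classLabels = loadSimpData()
labelMat = np.mat(classLabels).T
m = np.shape(datMat)[0]
D = np.mat(np.ones((m, 1)) / m)                   # start with equal weights
classEst = stumpClassify(datMat, 0, 1.3, 'lt')    # first stump's predictions
a = alpha(0.2)                                    # its weighted error under equal weights is 0.2
expon = np.multiply(-1 * a * labelMat, classEst)  # -alpha where correct, +alpha where wrong
D = np.multiply(D, np.exp(expon))
D = D / D.sum()                                   # renormalize so the weights sum to 1
print D.T  # the misclassified first point now carries weight 0.5; the rest drop to 0.125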
Let's consider the entire AdaBoost process:
In this section, we'll apply the AdaBoost algorithm to labeled data. As we evaluate each of the classifiers, we score them with an alpha value. Finally, we sum the product of the predicted labels and alpha for each point to create a matrix of floats. Each value in this matrix has a sign, which should correspond to the correct labels if our error went to zero.
In [18]:
datMat,classLabels=loadSimpData()
adaBoostTrainDS(datMat,classLabels,9)
With the code that we've already written, we have a list of weak classifiers, each with its corresponding alpha score:
[{'alpha': 0.6931471805599453, 'feature': 1, 'ineq': 'lt', 'thresh': 1.3}, {'alpha': 0.9729550745276565, 'feature': 2, 'ineq': 'lt', 'thresh': 1.0}, {'alpha': 0.8958797346140273, 'feature': 1, 'ineq': 'lt', 'thresh': 0.9}]
We can reuse the threshold value of the corresponding feature in each of these weak classifiers as a stump to label the unknown data. We'll recycle stumpClassify() on the new data, which means we can weight each classifier's label by its previously assigned alpha value. See adaClassify().
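To see what adaClassify() does with the unknown point [0, 0], here's a minimal sketch of the aggregation by hand, assuming the three weak classifiers listed above (alpha values rounded): each stump votes -1 for this point, the weighted votes accumulate to roughly -2.56, and the sign of that total is the final label.
# aggregate the weighted votes for a single point by hand
stumps = [{'alpha': 0.6931, 'feature': 1, 'ineq': 'lt', 'thresh': 1.3},
          {'alpha': 0.9730, 'feature': 2, 'ineq': 'lt', 'thresh': 1.0},
          {'alpha': 0.8959, 'feature': 1, 'ineq': 'lt', 'thresh': 0.9}]
point = np.mat([[0.0, 0.0]])
agg = 0.0
for s in stumps:
    vote = float(stumpClassify(point, s['feature'] - 1, s['thresh'], s['ineq']))
    agg += s['alpha'] * vote
    print "vote: %+d, running total: %+.4f" % (int(vote), agg)
print "label:", np.sign(agg)   # -1.0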
In [15]:
simple_application()
(notes below borrow from Eric Emer's presentation)
Pros and cons of AdaBoost (see Emer's slides).
The data points that have been misclassified most by the previous weak classifier are pinpointed and become the focus for the next iteration. Pinpointed here means that these regularly misclassified elements receive a larger weight and therefore contribute more to the weighted error.
With the score (alpha value) applied to each classifier's prediction set, we aggregate the scores element by element. The aggregated vector provides an optimally weighted majority vote of the weak classifiers!
See Robert Schapire's Explaining AdaBoost for a good discussion of AdaBoost.
Bagging vs. boosting (comparison from Emer's slides).
Finally, here is the two-class AdaBoost example from scikit-learn, which fits boosted decision stumps to two overlapping Gaussian-quantile classes: http://scikit-learn.org/stable/auto_examples/ensemble/plot_adaboost_twoclass.html
In [16]:
print(__doc__)
# Author: Noel Dawe <noel.dawe@gmail.com>
#
# License: BSD 3 clause
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_gaussian_quantiles
# Construct dataset
X1, y1 = make_gaussian_quantiles(cov=2.,
n_samples=200, n_features=2,
n_classes=2, random_state=1)
X2, y2 = make_gaussian_quantiles(mean=(3, 3), cov=1.5,
n_samples=300, n_features=2,
n_classes=2, random_state=1)
X = np.concatenate((X1, X2))
y = np.concatenate((y1, - y2 + 1))
# Create and fit an AdaBoosted decision tree
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
algorithm="SAMME",
n_estimators=200)
bdt.fit(X, y)
plot_colors = "br"
plot_step = 0.02
class_names = "AB"
plt.figure(figsize=(10, 5))
# Plot the decision boundaries
plt.subplot(121)
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
np.arange(y_min, y_max, plot_step))
Z = bdt.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
plt.axis("tight")
# Plot the training points
for i, n, c in zip(range(2), class_names, plot_colors):
idx = np.where(y == i)
plt.scatter(X[idx, 0], X[idx, 1],
c=c, cmap=plt.cm.Paired,
label="Class %s" % n)
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.legend(loc='upper right')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Decision Boundary')
# Plot the two-class decision scores
twoclass_output = bdt.decision_function(X)
plot_range = (twoclass_output.min(), twoclass_output.max())
plt.subplot(122)
for i, n, c in zip(range(2), class_names, plot_colors):
plt.hist(twoclass_output[y == i],
bins=10,
range=plot_range,
facecolor=c,
label='Class %s' % n,
alpha=.5)
x1, x2, y1, y2 = plt.axis()
plt.axis((x1, x2, y1, y2 * 1.2))
plt.legend(loc='upper right')
plt.ylabel('Samples')
plt.xlabel('Score')
plt.title('Decision Scores')
plt.tight_layout()
plt.subplots_adjust(wspace=0.35)
plt.show()