AdaBoost


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt



Load data


In [3]:
df = pd.DataFrame({'x1':[1., 2., 1.3, 1., 2.], 'x2': [2.1, 1.1, 1., 1., 1.], 
                   'Labels': [1.0, 1.0, -1.0, -1.0, 1.0]})

In [4]:
datMat = df.loc[:, ['x1', 'x2']].values

In [5]:
df


Out[5]:
Labels x1 x2
0 1.0 1.0 2.1
1 1.0 2.0 1.1
2 -1.0 1.3 1.0
3 -1.0 1.0 1.0
4 1.0 2.0 1.0

In [6]:
labels = df.Labels.values
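
For orientation, here is an illustrative snippet of my own (not a cell from the original notebook) that plots the five toy points with their labels; no single threshold on either feature separates them perfectly, which is why boosting several stumps helps:

# Illustrative only: scatter the toy points, colored by their label.
plt.scatter(datMat[:, 0], datMat[:, 1],
            c=['r' if l == 1.0 else 'b' for l in labels], s=60)
plt.xlabel('x1')
plt.ylabel('x2')
plt.show()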

In [7]:
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    '''Classify all samples with a single decision stump.

    Thresholds feature `dimen` at `threshVal`; samples on the `threshIneq`
    side ('lt' or 'gt') get -1.0, all others get +1.0.
    '''
    retArray = np.ones((np.shape(dataMatrix)[0], ))
    if threshIneq == 'lt':
        retArray[dataMatrix[:, dimen] <= threshVal] = -1.0
    else:
        retArray[dataMatrix[:, dimen] > threshVal] = -1.0
    return retArray
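
As a quick illustration of my own (not a cell from the original notebook), a single stump splitting on x1 at threshold 1.3 with the 'lt' rule labels the toy data like this:

# Illustrative only: one stump on dim 0 with threshold 1.3 and the 'lt' rule.
stumpClassify(datMat, 0, 1.3, 'lt')   # -> array([-1.,  1., -1., -1.,  1.])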

In [8]:
def buildStump(dataMatrix, labels, D):
    '''Find the best decision stump for the weighted data.

    Iterates over every feature, a grid of threshold values and both
    inequality directions, and returns the stump with the lowest
    D-weighted classification error, together with its predictions.
    '''
    # Initial values
    m, n = np.shape(dataMatrix)
    numSteps = 10.0
    bestStump = {}
    bestClassEst = np.zeros((m, ))
    minError = np.inf
    
    # for-loop about features
    for i in range(n):
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin)/numSteps
        
        # for-loop about steps
        for j in range(-1, int(numSteps) + 1): # j = -1 puts the threshold below rangeMin
            
            # for-loop about inequality
            for inequal in ['lt', 'gt']: # try both 'less than' and 'greater than' and keep the lower error
                
                threshVal = (rangeMin + float(j) * stepSize)
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                errArr = np.ones((m, ))
                errArr[predictedVals == labels] = 0
                weightedError = D.dot(errArr) # scalar value
                # print "split: dim %d, thresh %.2f, thresh ineqal: %s,\
                # the weighted error is %.3f" % (i, threshVal, inequal, weightedError)
                
                if weightedError < minError:
                    minError = weightedError
                    bestClassEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClassEst

In [9]:
D = np.ones((5, ))/5.

In [10]:
buildStump(datMat, labels, D)


Out[10]:
({'dim': 0, 'ineq': 'lt', 'thresh': 1.3},
 0.20000000000000001,
 array([-1.,  1., -1., -1.,  1.]))
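
A quick hand check of my own (not part of the original run): the stump found above (dim 0, thresh 1.3, 'lt') misclassifies only sample 0, so with uniform weights the D-weighted error is 1/5 = 0.2:

# Illustrative only: recompute the weighted error of the best stump by hand.
pred = stumpClassify(datMat, 0, 1.3, 'lt')
err = np.ones((5, ))
err[pred == labels] = 0
D.dot(err)   # -> 0.2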

In [33]:
def adaBoostingTrainDS(datMat, labels, numIt=40):
    '''Train AdaBoost with decision stumps as weak learners.

    Returns the list of weak classifiers and the aggregated class
    estimates (the latter is kept for the ROC plot below).
    '''
    weakClassArr = []
    m = np.shape(datMat)[0]
    D = np.ones((m, ))/m
    aggClassEst = np.zeros((m, ))
    for i in range(numIt):
        bestStump, error, classEst = buildStump(datMat, labels, D)
        # print "D:", D
        alpha = float(0.5*np.log((1.0-error)/max(error, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        # print "ClassEst: ", classEst
        expon = -1*alpha*labels*classEst
        D = D*np.exp(expon)
        D = D/D.sum()
        aggClassEst += alpha*classEst
        # print "aggClassEst: ", aggClassEst
        aggErrors = (np.sign(aggClassEst) != labels) * np.ones((m, ))
        errorRate = aggErrors.sum()/m
        print "total error:", errorRate, "\n"
        if errorRate == 0.0:
            break
    # return weakClassArr # original return
    return weakClassArr, aggClassEst # used for ROC
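
A note of my own (not in the original notebook): `alpha` above is the standard AdaBoost classifier weight, 0.5 * ln((1 - error) / error), and the sample weights D are re-weighted by exp(-alpha * y * h(x)) and renormalized. For a weak learner with weighted error 0.2 this gives:

# Illustrative only: alpha for a weak learner with weighted error 0.2.
0.5 * np.log((1 - 0.2) / 0.2)   # ~0.693, matching the first stump's 'alpha' below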

In [12]:
classifierArray = adaBoostingTrainDS(datMat, labels, 40)


total error: 0.2 

total error: 0.2 

total error: 0.0 


In [13]:
classifierArray


Out[13]:
[{'alpha': 0.6931471805599453, 'dim': 0, 'ineq': 'lt', 'thresh': 1.3},
 {'alpha': 0.9729550745276565, 'dim': 1, 'ineq': 'lt', 'thresh': 1.0},
 {'alpha': 0.8958797346140273,
  'dim': 0,
  'ineq': 'lt',
  'thresh': 0.90000000000000002}]

Test


In [14]:
def adaClassify(dataMatrix, classifierArr):
    '''Classify dataMatrix with a trained list of weak classifiers.'''
    m = np.shape(dataMatrix)[0]
    aggClassEst = np.zeros((m, ))
    for i in range(len(classifierArr)):
        classEst = stumpClassify(dataMatrix, classifierArr[i]['dim'],
                                 classifierArr[i]['thresh'], classifierArr[i]['ineq'])
        aggClassEst += classifierArr[i]['alpha']*classEst
        # print aggClassEst
    return np.sign(aggClassEst)

In [15]:
test_datMat = np.array([[0., 0.]])

In [16]:
adaClassify(test_datMat, classifierArray)


Out[16]:
array([-1.])
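
Why -1 (my own note): each of the three stumps uses the 'lt' rule and the point (0, 0) lies below every threshold, so all three vote -1 and the aggregate score is minus the sum of the alphas:

# Illustrative only: the aggregate score for the point (0, 0).
-sum(c['alpha'] for c in classifierArray)   # ~ -2.56, so np.sign(...) gives -1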

Horse colic dataset using AdaBoost


In [17]:
df_horse = pd.read_csv('Horse_colic_dataset/horseColicTraining2.txt', sep='\t', names=np.arange(22))

In [18]:
datMat_horse = df_horse.values[:, 0: 21]

In [19]:
labels_horse = df_horse.values[:, -1]

Using 10 weak learners


In [20]:
classifierArray = adaBoostingTrainDS(datMat_horse, labels_horse, 10)


total error: 0.284280936455 

total error: 0.284280936455 

total error: 0.247491638796 

total error: 0.247491638796 

total error: 0.254180602007 

total error: 0.240802675585 

total error: 0.240802675585 

total error: 0.220735785953 

total error: 0.247491638796 

total error: 0.230769230769 


In [21]:
df_test = pd.read_csv('Horse_colic_dataset/horseColicTest2.txt', sep='\t', names=np.arange(22))

In [22]:
datMat_test = df_test.values[:, 0: 21]

In [23]:
labels_test = df_test.values[:, -1]

In [24]:
prediction = adaClassify(datMat_test, classifierArray)

In [25]:
errArr = np.ones((67, ))

In [26]:
errArr[prediction != labels_test].sum()/67


Out[26]:
0.23880597014925373
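
Equivalently (an illustrative alternative of my own), the test error rate is just the mean of the mismatch indicator:

# Illustrative only: same error rate without the helper array.
(prediction != labels_test).mean()   # -> 0.2388...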

Using 50 weak learners


In [27]:
classifierArray = adaBoostingTrainDS(datMat_horse, labels_horse, 50)


total error: 0.284280936455 

total error: 0.284280936455 

total error: 0.247491638796 

total error: 0.247491638796 

total error: 0.254180602007 

total error: 0.240802675585 

total error: 0.240802675585 

total error: 0.220735785953 

total error: 0.247491638796 

total error: 0.230769230769 

total error: 0.240802675585 

total error: 0.214046822742 

total error: 0.227424749164 

total error: 0.217391304348 

total error: 0.220735785953 

total error: 0.217391304348 

total error: 0.224080267559 

total error: 0.224080267559 

total error: 0.230769230769 

total error: 0.224080267559 

total error: 0.214046822742 

total error: 0.207357859532 

total error: 0.224080267559 

total error: 0.224080267559 

total error: 0.214046822742 

total error: 0.220735785953 

total error: 0.204013377926 

total error: 0.207357859532 

total error: 0.210702341137 

total error: 0.217391304348 

total error: 0.210702341137 

total error: 0.217391304348 

total error: 0.207357859532 

total error: 0.210702341137 

total error: 0.207357859532 

total error: 0.207357859532 

total error: 0.197324414716 

total error: 0.190635451505 

total error: 0.200668896321 

total error: 0.197324414716 

total error: 0.200668896321 

total error: 0.19397993311 

total error: 0.19397993311 

total error: 0.190635451505 

total error: 0.1872909699 

total error: 0.190635451505 

total error: 0.190635451505 

total error: 0.1872909699 

total error: 0.19397993311 

total error: 0.1872909699 


In [28]:
prediction = adaClassify(datMat_test, classifierArray)

In [29]:
errArr = np.ones((67, ))

In [30]:
errArr[prediction != labels_test].sum()/67


Out[30]:
0.20895522388059701

In [43]:
def plotROC(predStrengths, labels):
    '''Plot the ROC curve for a vector of classifier scores and print the AUC.'''
    cur = (1., 1.)  # start from (1, 1): everything predicted positive
    ySum = 0.0      # running sum of curve heights, used for the AUC
    numPosClas = sum(labels == 1.0)
    yStep = 1/float(numPosClas)               # step down per true positive
    xStep = 1/float(len(labels)-numPosClas)   # step left per false positive
    sortedIndicies = predStrengths.argsort()  # ascending by prediction strength
    fig = plt.figure()
    fig.clf()
    ax = plt.subplot(111)
    # sweep the decision threshold from the lowest score to the highest
    for index in sortedIndicies.tolist():
        if labels[index] == 1.0:
            delX = 0
            delY = yStep
        else:
            delX = xStep
            delY = 0
            ySum += cur[1]  # accumulate curve height for the AUC rectangles
        ax.plot([cur[0], cur[0] - delX], [cur[1], cur[1] - delY], c='b')
        cur = (cur[0] - delX, cur[1] - delY)
    ax.plot([0, 1], [0, 1], 'b--')  # reference diagonal (random guessing)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    ax.axis([0, 1, 0, 1])
    plt.show()
    print "the Area Under the Curve is: ", ySum*xStep

In [35]:
classifierArray, aggClassEst = adaBoostingTrainDS(datMat_horse, labels_horse, 10)


total error: 0.284280936455 

total error: 0.284280936455 

total error: 0.247491638796 

total error: 0.247491638796 

total error: 0.254180602007 

total error: 0.240802675585 

total error: 0.240802675585 

total error: 0.220735785953 

total error: 0.247491638796 

total error: 0.230769230769 


In [44]:
plotROC(aggClassEst, labels_horse)


the Area Under the Curve is:  0.858296963506
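
As a cross-check of my own (assuming scikit-learn is installed; it is not used in the original notebook), the AUC can also be computed directly from the aggregated scores:

# Illustrative cross-check, assuming scikit-learn is available.
from sklearn.metrics import roc_auc_score
roc_auc_score(labels_horse, aggClassEst)   # should be close to the value printed above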
