AdaBoost


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt



Load data


In [3]:
df = pd.DataFrame({'x1':[1., 2., 1.3, 1., 2.], 'x2': [2.1, 1.1, 1., 1., 1.], 
                   'Labels': [1.0, 1.0, -1.0, -1.0, 1.0]})

In [4]:
datMat = df.loc[:, ['x1', 'x2']].values

In [5]:
df


Out[5]:
Labels x1 x2
0 1.0 1.0 2.1
1 1.0 2.0 1.1
2 -1.0 1.3 1.0
3 -1.0 1.0 1.0
4 1.0 2.0 1.0

In [6]:
labels = df.Labels.values
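
For orientation, here is an illustrative snippet of my own (not a cell from the original notebook) that plots the five toy points with their labels; no single threshold on either feature separates them perfectly, which is why boosting several stumps helps:

# Illustrative only: scatter the toy points, colored by their label.
plt.scatter(datMat[:, 0], datMat[:, 1],
            c=['r' if l == 1.0 else 'b' for l in labels], s=60)
plt.xlabel('x1')
plt.ylabel('x2')
plt.show()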

In [7]:
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    '''Classify all samples with a single decision stump.

    Thresholds feature `dimen` at `threshVal`; samples on the `threshIneq`
    side ('lt' or 'gt') get -1.0, all others get +1.0.
    '''
    retArray = np.ones((np.shape(dataMatrix)[0], ))
    if threshIneq == 'lt':
        retArray[dataMatrix[:, dimen] <= threshVal] = -1.0
    else:
        retArray[dataMatrix[:, dimen] > threshVal] = -1.0
    return retArray
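
As a quick illustration of my own (not a cell from the original notebook), a single stump splitting on x1 at threshold 1.3 with the 'lt' rule labels the toy data like this:

# Illustrative only: one stump on dim 0 with threshold 1.3 and the 'lt' rule.
stumpClassify(datMat, 0, 1.3, 'lt')   # -> array([-1.,  1., -1., -1.,  1.])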

In [8]:
def buildStump(dataMatrix, labels, D):
    '''Find the best decision stump for the weighted data.

    Iterates over every feature, a grid of threshold values and both
    inequality directions, and returns the stump with the lowest
    D-weighted classification error, together with its predictions.
    '''
    # Initial values
    m, n = np.shape(dataMatrix)
    numSteps = 10.0
    bestStump = {}
    bestClassEst = np.zeros((m, ))
    minError = np.inf
    
    # for-loop about features
    for i in range(n):
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin)/numSteps
        
        # for-loop about steps
        for j in range(-1, int(numSteps) + 1): # j = -1 puts the threshold below rangeMin
            
            # for-loop about inequality
            for inequal in ['lt', 'gt']: # try both 'less than' and 'greater than' and keep the lower error
                
                threshVal = (rangeMin + float(j) * stepSize)
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                errArr = np.ones((m, ))
                errArr[predictedVals == labels] = 0
                weightedError = D.dot(errArr) # scalar value
                # print "split: dim %d, thresh %.2f, thresh ineqal: %s,\
                # the weighted error is %.3f" % (i, threshVal, inequal, weightedError)
                
                if weightedError < minError:
                    minError = weightedError
                    bestClassEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClassEst

In [9]:
D = np.ones((5, ))/5.

In [10]:
buildStump(datMat, labels, D)


Out[10]:
({'dim': 0, 'ineq': 'lt', 'thresh': 1.3},
 0.20000000000000001,
 array([-1.,  1., -1., -1.,  1.]))
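
A quick hand check of my own (not part of the original run): the stump found above (dim 0, thresh 1.3, 'lt') misclassifies only sample 0, so with uniform weights the D-weighted error is 1/5 = 0.2:

# Illustrative only: recompute the weighted error of the best stump by hand.
pred = stumpClassify(datMat, 0, 1.3, 'lt')
err = np.ones((5, ))
err[pred == labels] = 0
D.dot(err)   # -> 0.2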

In [33]:
def adaBoostingTrainDS(datMat, labels, numIt=40):
    '''Train AdaBoost with decision stumps as weak learners.

    Returns the list of weak classifiers and the aggregated class
    estimates (the latter is kept for the ROC plot below).
    '''
    weakClassArr = []
    m = np.shape(datMat)[0]
    D = np.ones((m, ))/m
    aggClassEst = np.zeros((m, ))
    for i in range(numIt):
        bestStump, error, classEst = buildStump(datMat, labels, D)
        # print "D:", D
        alpha = float(0.5*np.log((1.0-error)/max(error, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        # print "ClassEst: ", classEst
        expon = -1*alpha*labels*classEst
        D = D*np.exp(expon)
        D = D/D.sum()
        aggClassEst += alpha*classEst
        # print "aggClassEst: ", aggClassEst
        aggErrors = (np.sign(aggClassEst) != labels) * np.ones((m, ))
        errorRate = aggErrors.sum()/m
        print "total error:", errorRate, "\n"
        if errorRate == 0.0:
            break
    # return weakClassArr # original return
    return weakClassArr, aggClassEst # used for ROC
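
A note of my own (not in the original notebook): `alpha` above is the standard AdaBoost classifier weight, 0.5 * ln((1 - error) / error), and the sample weights D are re-weighted by exp(-alpha * y * h(x)) and renormalized. For a weak learner with weighted error 0.2 this gives:

# Illustrative only: alpha for a weak learner with weighted error 0.2.
0.5 * np.log((1 - 0.2) / 0.2)   # ~0.693, matching the first stump's 'alpha' below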

In [12]:
classifierArray = adaBoostingTrainDS(datMat, labels, 40)


total error: 0.2 

total error: 0.2 

total error: 0.0 


In [13]:
classifierArray


Out[13]:
[{'alpha': 0.6931471805599453, 'dim': 0, 'ineq': 'lt', 'thresh': 1.3},
 {'alpha': 0.9729550745276565, 'dim': 1, 'ineq': 'lt', 'thresh': 1.0},
 {'alpha': 0.8958797346140273,
  'dim': 0,
  'ineq': 'lt',
  'thresh': 0.90000000000000002}]

Test


In [14]:
def adaClassify(dataMatrix, classifierArr):
    '''Classify dataMatrix with a trained list of weak classifiers.'''
    m = np.shape(dataMatrix)[0]
    aggClassEst = np.zeros((m, ))
    for i in range(len(classifierArr)):
        classEst = stumpClassify(dataMatrix, classifierArr[i]['dim'],
                                 classifierArr[i]['thresh'], classifierArr[i]['ineq'])
        aggClassEst += classifierArr[i]['alpha']*classEst
        # print aggClassEst
    return np.sign(aggClassEst)

In [15]:
test_datMat = np.array([[0., 0.]])

In [16]:
adaClassify(test_datMat, classifierArray)


Out[16]:
array([-1.])
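
Why -1 (my own note): each of the three stumps uses the 'lt' rule and the point (0, 0) lies below every threshold, so all three vote -1 and the aggregate score is minus the sum of the alphas:

# Illustrative only: the aggregate score for the point (0, 0).
-sum(c['alpha'] for c in classifierArray)   # ~ -2.56, so np.sign(...) gives -1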

Horse colic dataset using AdaBoost


In [17]:
df_horse = pd.read_csv('Horse_colic_dataset/horseColicTraining2.txt', sep='\t', names=np.arange(22))

In [18]:
datMat_horse = df_horse.values[:, 0: 21]

In [19]:
labels_horse = df_horse.values[:, -1]

Using 10 weak learners


In [20]:
classifierArray = adaBoostingTrainDS(datMat_horse, labels_horse, 10)


total error: 0.284280936455 

total error: 0.284280936455 

total error: 0.247491638796 

total error: 0.247491638796 

total error: 0.254180602007 

total error: 0.240802675585 

total error: 0.240802675585 

total error: 0.220735785953 

total error: 0.247491638796 

total error: 0.230769230769 


In [21]:
df_test = pd.read_csv('Horse_colic_dataset/horseColicTest2.txt', sep='\t', names=np.arange(22))

In [22]:
datMat_test = df_test.values[:, 0: 21]

In [23]:
labels_test = df_test.values[:, -1]

In [24]:
prediction = adaClassify(datMat_test, classifierArray)

In [25]:
errArr = np.ones((67, ))

In [26]:
errArr[prediction != labels_test].sum()/67


Out[26]:
0.23880597014925373
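
Equivalently (an illustrative alternative of my own), the test error rate is just the mean of the mismatch indicator:

# Illustrative only: same error rate without the helper array.
(prediction != labels_test).mean()   # -> 0.2388...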

Using 50 weak learners


In [27]:
classifierArray = adaBoostingTrainDS(datMat_horse, labels_horse, 50)


total error: 0.284280936455 

total error: 0.284280936455 

total error: 0.247491638796 

total error: 0.247491638796 

total error: 0.254180602007 

total error: 0.240802675585 

total error: 0.240802675585 

total error: 0.220735785953 

total error: 0.247491638796 

total error: 0.230769230769 

total error: 0.240802675585 

total error: 0.214046822742 

total error: 0.227424749164 

total error: 0.217391304348 

total error: 0.220735785953 

total error: 0.217391304348 

total error: 0.224080267559 

total error: 0.224080267559 

total error: 0.230769230769 

total error: 0.224080267559 

total error: 0.214046822742 

total error: 0.207357859532 

total error: 0.224080267559 

total error: 0.224080267559 

total error: 0.214046822742 

total error: 0.220735785953 

total error: 0.204013377926 

total error: 0.207357859532 

total error: 0.210702341137 

total error: 0.217391304348 

total error: 0.210702341137 

total error: 0.217391304348 

total error: 0.207357859532 

total error: 0.210702341137 

total error: 0.207357859532 

total error: 0.207357859532 

total error: 0.197324414716 

total error: 0.190635451505 

total error: 0.200668896321 

total error: 0.197324414716 

total error: 0.200668896321 

total error: 0.19397993311 

total error: 0.19397993311 

total error: 0.190635451505 

total error: 0.1872909699 

total error: 0.190635451505 

total error: 0.190635451505 

total error: 0.1872909699 

total error: 0.19397993311 

total error: 0.1872909699 


In [28]:
prediction = adaClassify(datMat_test, classifierArray)

In [29]:
errArr = np.ones((67, ))

In [30]:
errArr[prediction != labels_test].sum()/67


Out[30]:
0.20895522388059701

In [43]:
def plotROC(predStrengths, labels):
    '''Plot the ROC curve for a vector of classifier scores and print the AUC.'''
    cur = (1., 1.)  # start from (1, 1): everything predicted positive
    ySum = 0.0      # running sum of curve heights, used for the AUC
    numPosClas = sum(labels == 1.0)
    yStep = 1/float(numPosClas)               # step down per true positive
    xStep = 1/float(len(labels)-numPosClas)   # step left per false positive
    sortedIndicies = predStrengths.argsort()  # ascending by prediction strength
    fig = plt.figure()
    fig.clf()
    ax = plt.subplot(111)
    # sweep the decision threshold from the lowest score to the highest
    for index in sortedIndicies.tolist():
        if labels[index] == 1.0:
            delX = 0
            delY = yStep
        else:
            delX = xStep
            delY = 0
            ySum += cur[1]  # accumulate curve height for the AUC rectangles
        ax.plot([cur[0], cur[0] - delX], [cur[1], cur[1] - delY], c='b')
        cur = (cur[0] - delX, cur[1] - delY)
    ax.plot([0, 1], [0, 1], 'b--')  # reference diagonal (random guessing)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    ax.axis([0, 1, 0, 1])
    plt.show()
    print "the Area Under the Curve is: ", ySum*xStep

In [35]:
classifierArray, aggClassEst = adaBoostingTrainDS(datMat_horse, labels_horse, 10)


total error: 0.284280936455 

total error: 0.284280936455 

total error: 0.247491638796 

total error: 0.247491638796 

total error: 0.254180602007 

total error: 0.240802675585 

total error: 0.240802675585 

total error: 0.220735785953 

total error: 0.247491638796 

total error: 0.230769230769 


In [44]:
plotROC(aggClassEst, labels_horse)


the Area Under the Curve is:  0.858296963506
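
As a cross-check of my own (assuming scikit-learn is installed; it is not used in the original notebook), the AUC can also be computed directly from the aggregated scores:

# Illustrative cross-check, assuming scikit-learn is available.
from sklearn.metrics import roc_auc_score
roc_auc_score(labels_horse, aggClassEst)   # should be close to the value printed above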
