In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [3]:
# Toy training set: five 2-D points (columns x1, x2) with class labels +1.0 / -1.0.
df = pd.DataFrame({'x1':[1., 2., 1.3, 1., 2.], 'x2': [2.1, 1.1, 1., 1., 1.],
'Labels': [1.0, 1.0, -1.0, -1.0, 1.0]})
In [4]:
# Feature columns only, as a NumPy array with one row per sample (5 rows, 2 columns).
datMat = df.loc[:, ['x1', 'x2']].values
In [5]:
df
Out[5]:
In [6]:
# Label column as a 1-D NumPy array, row-aligned with datMat.
labels = df.Labels.values
In [7]:
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Classify every row with a single-feature threshold test (a decision stump).

    Parameters
    ----------
    dataMatrix : 2-D NumPy array, one row per sample.
    dimen : column index of the feature to test.
    threshVal : threshold the feature value is compared against.
    threshIneq : 'lt' marks rows with feature <= threshVal as -1.0;
        any other value marks rows with feature > threshVal as -1.0.

    Returns
    -------
    1-D float array with one entry per row: -1.0 for marked rows, 1.0 otherwise.
    """
    feature = dataMatrix[:, dimen]
    if threshIneq == 'lt':
        negative = feature <= threshVal
    else:
        negative = feature > threshVal
    # Rows selected by the mask become -1.0, every other row stays +1.0.
    return np.where(negative, -1.0, 1.0)
In [8]:
def buildStump(dataMatrix, labels, D, numSteps=10):
    """Search for the decision stump with the lowest weighted error.

    For every feature column, candidate thresholds are laid out from one step
    below the column minimum up to the column maximum, and both inequality
    directions ('lt' and 'gt') are tried at each threshold.

    Parameters
    ----------
    dataMatrix : 2-D NumPy array, one row per sample.
    labels : 1-D array of true labels, compared element-wise with the
        predictions of stumpClassify (which are +1.0 / -1.0).
    D : 1-D array of per-sample weights; the weighted error is D.dot(errors).
    numSteps : number of equal-width steps between the column minimum and
        maximum (default 10, the value previously hard-coded). Must be >= 1.

    Returns
    -------
    (bestStump, minError, bestClassEst)
        bestStump : dict with keys 'dim', 'thresh' and 'ineq'.
        minError : weighted error of that stump.
        bestClassEst : its predictions for every row of dataMatrix.

    Raises
    ------
    ValueError if numSteps is smaller than 1.
    """
    numSteps = int(numSteps)
    if numSteps < 1:
        raise ValueError("numSteps must be >= 1")

    m, n = np.shape(dataMatrix)
    bestStump = {}
    bestClassEst = np.zeros((m, ))
    minError = np.inf

    for i in range(n):
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / float(numSteps)
        # j == -1 places the threshold one step below the column minimum.
        for j in range(-1, numSteps + 1):
            # The threshold does not depend on the inequality, so compute it once.
            threshVal = rangeMin + float(j) * stepSize
            for inequal in ['lt', 'gt']:
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                # 1.0 where the prediction is wrong, 0.0 where it is right.
                errArr = np.ones((m, ))
                errArr[predictedVals == labels] = 0
                weightedError = D.dot(errArr)  # scalar
                # Strict '<' keeps the first stump found among ties.
                if weightedError < minError:
                    minError = weightedError
                    bestClassEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClassEst
In [9]:
# Uniform sample weights: five entries of 1/5 each.
D = np.ones((5, ))/5.
In [10]:
# Best single stump under those weights; returns (stump dict, weighted error, predictions).
buildStump(datMat, labels, D)
Out[10]:
In [33]:
def adaBoostingTrainDS(datMat, labels, numIt=40):
    """Train an AdaBoost ensemble of decision stumps.

    Each round fits the best stump under the current sample weights, gives it
    a vote weight alpha, re-weights the samples and adds the weighted vote to
    the running aggregate. Training stops early once the aggregate classifier
    makes no error on the training labels.

    Parameters
    ----------
    datMat : 2-D NumPy array, one row per sample.
    labels : 1-D array of labels; the weight update multiplies labels by the
        stump predictions, so labels are expected to be +1.0 / -1.0.
    numIt : maximum number of boosting rounds (default 40).

    Returns
    -------
    (weakClassArr, aggClassEst)
        weakClassArr : list of stump dicts ('dim', 'thresh', 'ineq', 'alpha'),
            in training order.
        aggClassEst : 1-D array with the alpha-weighted sum of stump votes for
            every training sample (used for ROC plotting).
    """
    weakClassArr = []
    m = np.shape(datMat)[0]
    D = np.ones((m, )) / m  # start from uniform sample weights
    aggClassEst = np.zeros((m, ))
    for _ in range(numIt):
        bestStump, error, classEst = buildStump(datMat, labels, D)
        # Clamp the denominator so a zero-error stump yields a large finite alpha
        # instead of a division by zero.
        alpha = float(0.5 * np.log((1.0 - error) / max(error, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        # labels * classEst is +1 for a correct vote and -1 for a wrong one, so
        # correctly classified samples shrink and misclassified ones grow.
        D = D * np.exp(-1 * alpha * labels * classEst)
        D = D / D.sum()  # renormalise so the weights sum to 1
        aggClassEst += alpha * classEst
        errorRate = np.mean(np.sign(aggClassEst) != labels)
        # Single-argument print: same text on Python 2 and Python 3.
        print("total error: %s \n" % errorRate)
        if errorRate == 0.0:
            break
    return weakClassArr, aggClassEst
In [12]:
# adaBoostingTrainDS returns (stump list, aggregate training scores); unpack it so
# classifierArray holds only the stump list that adaClassify indexes into.
classifierArray, aggClassEst = adaBoostingTrainDS(datMat, labels, 40)
In [13]:
classifierArray
Out[13]:
In [14]:
def adaClassify(dataMatrix, classifierArr):
    """Classify samples with a trained list of weighted decision stumps.

    Parameters
    ----------
    dataMatrix : 2-D NumPy array, one row per sample to classify.
    classifierArr : sequence of stump dicts, each with the keys 'dim',
        'thresh', 'ineq' and 'alpha'.

    Returns
    -------
    1-D array holding the sign of the alpha-weighted vote for every row
    (+1.0, -1.0, or 0.0 when the votes cancel exactly or the list is empty).
    """
    m = np.shape(dataMatrix)[0]
    aggClassEst = np.zeros((m, ))
    # Use the classifierArr argument itself, never a notebook-level global, so the
    # result depends only on what the caller passed in.
    for stump in classifierArr:
        classEst = stumpClassify(dataMatrix, stump['dim'],
                                 stump['thresh'], stump['ineq'])
        aggClassEst += stump['alpha'] * classEst
    return np.sign(aggClassEst)
In [15]:
# Single query point at the origin.
test_datMat = np.array([[0., 0.]])
In [16]:
adaClassify(test_datMat, classifierArray)
Out[16]:
In [17]:
# Tab-separated training file; the 22 column names 0..21 are supplied explicitly.
df_horse = pd.read_csv('Horse_colic_dataset/horseColicTraining2.txt', sep='\t', names=np.arange(22))
In [18]:
# Columns 0-20 are used as the feature matrix.
datMat_horse = df_horse.values[:, 0: 21]
In [19]:
# The last column is used as the label vector (presumably +1 / -1 — verify against the data file).
labels_horse = df_horse.values[:, -1]
使用10个weak learner
In [20]:
# adaBoostingTrainDS returns (stump list, aggregate training scores); unpack it so
# classifierArray holds only the stump list that adaClassify indexes into.
classifierArray, aggClassEst = adaBoostingTrainDS(datMat_horse, labels_horse, 10)
In [21]:
# Test file, read with the same layout as the training file (tab-separated, columns 0..21).
df_test = pd.read_csv('Horse_colic_dataset/horseColicTest2.txt', sep='\t', names=np.arange(22))
In [22]:
# Columns 0-20 are used as the feature matrix.
datMat_test = df_test.values[:, 0: 21]
In [23]:
# The last column is used as the label vector.
labels_test = df_test.values[:, -1]
In [24]:
# Predicted class sign for every test row.
prediction = adaClassify(datMat_test, classifierArray)
In [25]:
# All-ones vector of hard-coded length 67 — presumably the number of test rows; verify against df_test.
errArr = np.ones((67, ))
In [26]:
# Test error rate: fraction of rows whose prediction differs from the true label.
# Taking the mean avoids hard-coding the number of test samples.
np.mean(prediction != labels_test)
Out[26]:
使用50个weak learners
In [27]:
# adaBoostingTrainDS returns (stump list, aggregate training scores); unpack it so
# classifierArray holds only the stump list that adaClassify indexes into.
classifierArray, aggClassEst = adaBoostingTrainDS(datMat_horse, labels_horse, 50)
In [28]:
# Predicted class sign for every test row, using the ensemble trained in the previous cell.
prediction = adaClassify(datMat_test, classifierArray)
In [29]:
# All-ones vector of hard-coded length 67 — presumably the number of test rows; verify against df_test.
errArr = np.ones((67, ))
In [30]:
# Test error rate: fraction of rows whose prediction differs from the true label.
# Taking the mean avoids hard-coding the number of test samples.
np.mean(prediction != labels_test)
Out[30]:
In [43]:
def plotROC(predStrengths, labels):
    """Plot a ROC curve from real-valued classifier scores and print its AUC.

    The curve starts at (1, 1) and is traced towards (0, 0) by visiting the
    samples in order of increasing score: a sample labelled 1.0 moves the
    cursor down by one true-positive step, any other sample moves it left by
    one false-positive step.

    Parameters
    ----------
    predStrengths : 1-D NumPy array of scores, one per sample.
    labels : 1-D array of labels aligned with predStrengths; entries equal to
        1.0 are the positive class, everything else counts as negative.

    Returns
    -------
    None. Shows the figure and prints the area under the curve.

    Raises
    ------
    ZeroDivisionError if labels contains no positive or no negative sample.
    """
    labels = np.asarray(labels)
    cur = (1., 1.)  # cursor starts in the top-right corner
    ySum = 0.0      # sum of curve heights at every horizontal step
    numPosClas = np.count_nonzero(labels == 1.0)
    yStep = 1 / float(numPosClas)
    xStep = 1 / float(len(labels) - numPosClas)
    sortedIndicies = predStrengths.argsort()

    # Collect the vertices first so the whole curve is drawn with one plot call.
    xs, ys = [cur[0]], [cur[1]]
    for index in sortedIndicies.tolist():
        if labels[index] == 1.0:
            delX, delY = 0, yStep
        else:
            delX, delY = xStep, 0
            # Each horizontal step adds a rectangle of width xStep and height cur[1].
            ySum += cur[1]
        cur = (cur[0] - delX, cur[1] - delY)
        xs.append(cur[0])
        ys.append(cur[1])

    fig, ax = plt.subplots()
    ax.plot(xs, ys, c='b')
    ax.plot([0, 1], [0, 1], 'b--')  # diagonal reference line
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.axis([0, 1, 0, 1])
    plt.show()
    # Single-argument print: same text on Python 2 and Python 3.
    print("the Area Under the Curve is:  %s" % (ySum * xStep))
In [35]:
# Retrain with 10 rounds, keeping the aggregate training scores returned alongside the stumps.
classifierArray, aggClassEst = adaBoostingTrainDS(datMat_horse, labels_horse, 10)
In [44]:
# ROC curve of the aggregate scores against the training labels.
plotROC(aggClassEst, labels_horse)
In [ ]: