Here we implement the procedure recommended by Hsu, Chang and Lin for applying an SVM robustly: the kernel parameters are chosen by a grid search, with each candidate scored by cross-validation on the training set. No great improvement is found for this dataset.
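The core of the Hsu, Chang and Lin recipe is to scan C and gamma over exponentially spaced grids and keep the pair with the best cross-validation accuracy. As a minimal sketch of that idea (the cells below implement it by hand), the grids used later in this notebook can be generated like so; the exponent ranges shown are exactly those in the grid-search cell:

# exponentially spaced parameter grids, as suggested by Hsu, Chang and Lin;
# these reproduce the grids used in the grid-search cell below
gammaGrid = [2**e for e in range(-15, 4, 2)]   # 2^-15, 2^-13, ..., 2^1, 2^3
cGrid = [2**e for e in range(-5, 16, 2)]       # 2^-5, 2^-3, ..., 2^13, 2^15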
In [1]:
# helpers
import json
import random
import numpy as np
def printSummary(title, TT, TF, FT, FF):
    '''
    print the four classification outcomes (counts or rates) under the given title
    '''
    print title
    print '\t Correct flags:', TT
    print '\t False positive:', TF
    print '\t False negative:', FT
    print '\t Correct pass:', FF
def shuffleLists(a, b):
    '''
    given two lists a, b, shuffle them in place, maintaining pairwise correspondence.
    thanks http://stackoverflow.com/questions/13343347/randomizing-two-lists-and-maintaining-order-in-python
    '''
    combined = zip(a, b)
    random.seed(2154)
    random.shuffle(combined)
    a[:], b[:] = zip(*combined)
def reloadData():
    ## read raw data
    with open('../../../AutoQC_raw/true.dat') as true_data:
        truth = json.load(true_data)
    with open('../../../AutoQC_raw/results.dat') as results_data:
        rawResults = json.load(results_data)
    return truth, rawResults
def transpose(lists):
    '''
    return the transpose of lists, a list of lists.
    all the inner lists had better be the same length!
    '''
    T = []
    for i in range(len(lists[0])):
        T.append([None]*len(lists))
    for i in range(len(lists)):
        for j in range(len(lists[0])):
            T[j][i] = lists[i][j]
    return T
def runClassifier(classifier, trainingSize):
    '''
    given a scikit-learn classifier, train it on the first trainingSize points of data and truth,
    and return the counts of true/false positives and negatives on the remainder of the data
    '''
    #load and arrange data
    truth, rawResults = reloadData()
    data = transpose(rawResults) #arrange data into rows by profile for consumption by scikit-learn
    shuffleLists(data, truth) #randomize order of profiles
    #train svm
    classifier.fit(data[0:trainingSize], truth[0:trainingSize])
    #predict values for remainder of profiles
    TT = 0.
    TF = 0.
    FT = 0.
    FF = 0.
    for i in range(trainingSize, len(truth)):
        assessment = classifier.predict([data[i]])[0] #predict expects a 2D array of samples
        if assessment and truth[i]:
            TT += 1
        elif assessment and not truth[i]:
            TF += 1
        elif not assessment and truth[i]:
            FT += 1
        elif not assessment and not truth[i]:
            FF += 1
    return TT, TF, FT, FF
truth, rawResults = reloadData()
datasetSize = len(truth)
data = transpose(rawResults)
shuffleLists(data, truth)
trainingSize = 5000
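To make the data handling above concrete, here is a toy run of the two list helpers. The arrays are made up purely for illustration (toyResults and toyTruth are not part of the real dataset):

# illustrative only: tiny made-up stand-ins for rawResults and truth
toyResults = [[True, False, False], [False, False, True]]   # one inner list per QC test
toyTruth = [True, False, True]                              # one true flag per profile
rows = transpose(toyResults)   # -> [[True, False], [False, False], [False, True]], one row per profile
shuffleLists(rows, toyTruth)   # shuffles both lists with the same (seeded) permutation
print rows, toyTruth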
In [2]:
#crossvalidation
def crossvalidate(classifier, data, truth, folds):
    '''
    run an n-fold crossvalidation on the training data and classifier provided,
    and return the fraction of held-out profiles classified correctly
    '''
    foldSize = len(truth) / folds
    correct = 0.
    for i in range(folds):
        #hold out the i-th fold for testing, train on the rest
        testData = data[i*foldSize : (i+1)*foldSize]
        testTruth = truth[i*foldSize : (i+1)*foldSize]
        trainingData = data[: i*foldSize] + data[(i+1)*foldSize:]
        trainingTruth = truth[: i*foldSize] + truth[(i+1)*foldSize:]
        classifier.fit(trainingData, trainingTruth)
        for j in range(foldSize):
            guess = classifier.predict([testData[j]])[0] #predict expects a 2D array of samples
            if guess and testTruth[j]:
                correct += 1
            elif not guess and not testTruth[j]:
                correct += 1
    return correct / len(truth)
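The same kind of score can be obtained with scikit-learn's built-in helper. This sketch assumes a reasonably recent scikit-learn, where cross_val_score lives in sklearn.model_selection (older releases expose it from sklearn.cross_validation), and note the built-in fold splitting differs slightly from the hand-rolled version above:

# minimal equivalent using scikit-learn's built-in cross-validation helper
from sklearn.model_selection import cross_val_score
from sklearn import svm
scores = cross_val_score(svm.SVC(kernel='rbf'), data[0:trainingSize], truth[0:trainingSize], cv=5)
print scores.mean()   # mean accuracy across the 5 folds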
In [3]:
from sklearn import svm
gammaGrid = [2**-15, 2**-13, 2**-11, 2**-9, 2**-7, 2**-5, 2**-3, 2**-1, 2, 8]
cGrid = [2**-5, 2**-3, 2**-1, 2, 2**3, 2**5, 2**7, 2**9, 2**11, 2**13, 2**15]
results = []
for g in gammaGrid:
    for c in cGrid:
        clf = svm.SVC(C=c, gamma=g, kernel='rbf')
        results.append((crossvalidate(clf, data[0:trainingSize], truth[0:trainingSize], 5), c, g))
from operator import itemgetter
max(results, key=itemgetter(0))
Out[3]:
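For reference, the same search can be expressed with scikit-learn's GridSearchCV (again assuming a recent scikit-learn); up to differences in how the folds are drawn, its best parameters should agree with the hand-rolled loop above:

# the same grid search via scikit-learn's GridSearchCV
from sklearn.model_selection import GridSearchCV
search = GridSearchCV(svm.SVC(kernel='rbf'),
                      param_grid={'C': cGrid, 'gamma': gammaGrid},
                      cv=5)
search.fit(data[0:trainingSize], truth[0:trainingSize])
print search.best_params_, search.best_score_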
In [4]:
clf = svm.SVC(C=.125, gamma=2, kernel='rbf')
TT, TF, FT, FF = runClassifier(clf, trainingSize)
printSummary('SVM with grid-searched RBF kernel', TT/(datasetSize-trainingSize), TF/(datasetSize-trainingSize), FT/(datasetSize-trainingSize), FF/(datasetSize-trainingSize))
So with tuned parameters the RBF kernel rises to match the default performance of the linear kernel. We attempt the same procedure with a linear kernel:
In [5]:
cGrid = [2**-5, 2**-3, 2**-1, 2, 2**3, 2**5, 2**7, 2**9, 2**11, 2**13, 2**15]
results = []
for c in cGrid:
    clf = svm.SVC(C=c, kernel='linear')
    results.append((crossvalidate(clf, data[0:trainingSize], truth[0:trainingSize], 5), c))
from operator import itemgetter
max(results,key=itemgetter(0))
Out[5]:
In [6]:
clf = svm.SVC(C=2, kernel='linear')
TT, TF, FT, FF = runClassifier(clf, trainingSize)
printSummary('SVM with grid-searched linear kernel', TT/(datasetSize-trainingSize), TF/(datasetSize-trainingSize), FT/(datasetSize-trainingSize), FF/(datasetSize-trainingSize))