Grid-Search Crossvalidation

Here we implement the grid-search procedure recommended by Hsu, Chang and Lin for robust application of an SVM: model parameters are optimized by cross-validation on the training set. No great improvement is found for this dataset.


In [1]:
# helpers

import json
import random
import numpy as np

def printSummary(title, TT, TF, FT, FF):
    '''
    Print a labeled confusion-matrix summary.

    title -- heading printed above the counts
    TT    -- true positives (correctly flagged profiles)
    TF    -- false positives
    FT    -- false negatives
    FF    -- true negatives (correctly passed profiles)
    '''
    # Single-argument print(...) behaves identically under Python 2 and 3,
    # unlike the original Python-2-only print statements.
    print(title)
    print('\t Correct flags: ' + str(TT))
    print('\t False positive: ' + str(TF))
    print('\t False negative: ' + str(FT))
    print('\t Correct pass: ' + str(FF))

def shuffleLists(a, b):
    '''
    Shuffle lists a and b in place, maintaining pairwise correspondence.

    The fixed seed makes every call produce the same permutation, so
    train/test splits are reproducible across runs.
    thanks http://stackoverflow.com/questions/13343347/randomizing-two-lists-and-maintaining-order-in-python
    '''
    # list(...) is required on Python 3, where zip returns an iterator that
    # random.shuffle cannot handle; it is a no-op on Python 2.
    combined = list(zip(a, b))
    random.seed(2154)
    random.shuffle(combined)

    # write the shuffled pairs back through the original list objects
    a[:], b[:] = zip(*combined)

def reloadData(): 
    '''
    Load the ground-truth flags and raw QC-test results from disk.

    Returns (truth, rawResults) as parsed from the two JSON files below.
    truth is presumably one label per profile and rawResults one list per
    QC test -- confirm against the producer of these files.

    NOTE(review): this function is redefined byte-for-byte later in this
    file; one copy should be removed.
    '''
    ## read raw data
    with open('../../../AutoQC_raw/true.dat') as true_data:    
        truth = json.load(true_data)

    with open('../../../AutoQC_raw/results.dat') as results_data:    
        rawResults = json.load(results_data)
        
    return truth, rawResults

def transpose(lists):
    '''
    Return the transpose of lists, a list of lists.
    All inner lists must share the same length; an empty outer list raises
    IndexError.
    '''
    nRows = len(lists)
    nCols = len(lists[0])

    # element [r][c] of the input becomes element [c][r] of the output
    return [[lists[r][c] for r in range(nRows)] for c in range(nCols)]

def reloadData(): 
    '''
    Load the ground-truth flags and raw QC-test results from disk.

    NOTE(review): this is a byte-for-byte duplicate of the earlier
    reloadData definition in this file; this redefinition shadows it and
    one of the two copies should be deleted.
    '''
    ## read raw data
    with open('../../../AutoQC_raw/true.dat') as true_data:    
        truth = json.load(true_data)

    with open('../../../AutoQC_raw/results.dat') as results_data:    
        rawResults = json.load(results_data)
        
    return truth, rawResults

def runClassifier(classifier, trainingSize):
    '''
    Train a scikit-learn classifier on the first trainingSize shuffled
    profiles and evaluate it on the remainder.

    Returns (TT, TF, FT, FF): float counts of correct flags, false
    positives, false negatives and correct passes on the held-out profiles.
    '''
    # load and arrange data: one row per profile, shuffled reproducibly
    truth, rawResults = reloadData()
    data = transpose(rawResults)
    shuffleLists(data, truth)

    # fit on the leading slice of the shuffled data
    classifier.fit(data[:trainingSize], truth[:trainingSize])

    # tally the confusion matrix over the held-out profiles
    TT, TF, FT, FF = 0., 0., 0., 0.
    for profile, flagged in zip(data[trainingSize:], truth[trainingSize:]):
        # NOTE(review): a single 1-D sample is passed to predict(); newer
        # scikit-learn versions require a 2-D array -- confirm library version.
        verdict = classifier.predict(profile)
        if verdict and flagged:
            TT += 1
        elif verdict and not flagged:
            TF += 1
        elif not verdict and flagged:
            FT += 1
        else:
            FF += 1

    return TT, TF, FT, FF

# Load and arrange the dataset once at top level so all later cells share it.
truth, rawResults = reloadData()
datasetSize = len(truth)  # total number of profiles
data = transpose(rawResults)  # one row per profile for scikit-learn
shuffleLists(data, truth)  # deterministic shuffle (fixed seed inside)
trainingSize = 5000  # profiles used for training; the rest are held out

In [2]:
#crossvalidation

def crossvalidate(classifier, data, truth, folds):
    '''
    Run an n-fold cross-validation on the training data and classifier provided.

    classifier -- object exposing scikit-learn-style fit(X, y) / predict(x)
    data       -- list of feature rows, one per profile
    truth      -- list of labels, pairwise with data
    folds      -- number of folds

    Returns the fraction of profiles classified correctly when held out.
    Profiles beyond folds * (len(truth) // folds) never appear in a test
    fold (though they are in every training set) yet still count in the
    denominator, so the score is slightly pessimistic when len(truth) is
    not a multiple of folds.
    '''
    # Floor division: on Python 3, len(truth) / folds is a float, which
    # breaks range() and the slices below. // is identical on Python 2.
    foldSize = len(truth) // folds
    correct = 0.

    for i in range(folds):
        # hold out fold i, train on everything else
        testData = data[i*foldSize : (i+1)*foldSize]
        testTruth = truth[i*foldSize : (i+1)*foldSize]
        trainingData = data[: i*foldSize] + data[(i+1)*foldSize:]
        trainingTruth = truth[: i*foldSize] + truth[(i+1)*foldSize:]

        classifier.fit(trainingData, trainingTruth)

        for j in range(foldSize):
            # NOTE(review): a single 1-D sample is passed to predict(); newer
            # scikit-learn versions require a 2-D array -- confirm version.
            guess = classifier.predict(testData[j])
            if guess and testTruth[j]:
                correct += 1
            elif not guess and not testTruth[j]:
                correct += 1

    return correct / len(truth)

In [3]:
from sklearn import svm

# exponentially spaced search grids, per Hsu/Chang/Lin's recommendation
gammaGrid = [2**k for k in range(-15, 0, 2)] + [2, 8]
cGrid = [2**k for k in range(-5, 16, 2)]

# score every (gamma, C) pair by 5-fold CV accuracy on the training slice
results = [
    (crossvalidate(svm.SVC(C=c, gamma=g, kernel='rbf'),
                   data[:trainingSize], truth[:trainingSize], 5), c, g)
    for g in gammaGrid for c in cGrid
]

from operator import itemgetter
max(results, key=itemgetter(0))


Out[3]:
(0.919, 0.125, 2)

In [4]:
# Evaluate the grid-search winner (CV accuracy 0.919 at C=0.125, gamma=2,
# per Out[3]) on the held-out profiles.
clf = svm.SVC(C=.125, gamma=2, kernel='rbf')
TT, TF, FT, FF = runClassifier(clf, trainingSize)
# report the confusion-matrix counts as fractions of the held-out set
printSummary('SVM with grid-searched RBF kernel', TT/(datasetSize-trainingSize), TF/(datasetSize-trainingSize), FT/(datasetSize-trainingSize), FF/(datasetSize-trainingSize))


SVM with grid-searched RBF kernel
	 Correct flags: 0.0553164397461
	 False positive: 0.0200149880295
	 False negative: 0.0612453328205
	 Correct pass: 0.863423239404

So we see the RBF kernel rising to the level of the default performance of the linear kernel. We attempt the same procedure with a linear kernel:


In [5]:
# linear kernel has no gamma, so only C needs searching
cGrid = [2**k for k in range(-5, 16, 2)]

# score each C by 5-fold CV accuracy on the training slice
results = [
    (crossvalidate(svm.SVC(C=c, kernel='linear'),
                   data[:trainingSize], truth[:trainingSize], 5), c)
    for c in cGrid
]

from operator import itemgetter
max(results, key=itemgetter(0))


Out[5]:
(0.9162, 2)

In [6]:
# Evaluate the best linear-kernel C (CV accuracy 0.9162 at C=2, per Out[5])
# on the held-out profiles.
clf = svm.SVC(C=2, kernel='linear')
TT, TF, FT, FF = runClassifier(clf, trainingSize)
# report the confusion-matrix counts as fractions of the held-out set
printSummary('SVM with grid-searched linear kernel', TT/(datasetSize-trainingSize), TF/(datasetSize-trainingSize), FT/(datasetSize-trainingSize), FF/(datasetSize-trainingSize))


SVM with grid-searched linear kernel
	 Correct flags: 0.0544741920723
	 False positive: 0.0184100751391
	 False negative: 0.0620875804943
	 Correct pass: 0.865028152294

In [ ]: