Splice Dataset


In [1]:
# Imports and global plotting style for the notebook.
#
# NOTE(review): `sklearn.cross_validation` only exists in old scikit-learn
# releases (it was superseded by `sklearn.model_selection`). It is kept here
# because the experiment cell builds `cv.StratifiedKFold(y_train, 10)` with the
# old constructor signature and passes the result to kcat's `model.train`;
# confirm what kcat expects before migrating.
from sklearn import svm
from sklearn import cross_validation as cv
import matplotlib.pyplot as plt  # previously missing: `plt` was used below without being imported
import pandas as pd

# Use the ggplot look for every figure produced by later cells.
plt.style.use('ggplot')

from kcat.datasets import Splice
from kcat.kernels import helpers as kh
from kcat.kernels import search as ks

In [2]:
# Instantiate the Splice dataset object from kcat.datasets.
# Later cells call `dataset.train_test_split(...)` on it to draw train/test partitions.
dataset = Splice()

In [3]:
# Kernel model classes under comparison, in the order they are evaluated and printed.
kernel_classes = (
    kh.RBF, kh.K0, kh.K1,
    kh.M3, kh.M4, kh.M5, kh.M6, kh.M7, kh.M9,
    kh.MC, kh.MD, kh.ME,
)

# Long-format results: one (kernel name, test error) entry per kernel per repetition.
data = {'Kernel': [], 'Test Error': []}

for seed in range(20):
    print("Iteration {}".format(seed))

    # Draw a 200-train / 400-test partition; the repetition index is the random state,
    # so every repetition uses a different but repeatable split.
    X_train, X_test, y_train, y_test = dataset.train_test_split(
        train_size=200,
        test_size=400,
        random_state=seed,
    )

    # Stratified folds over the training labels (second argument: 10),
    # shared by every kernel in this repetition.
    cvf = cv.StratifiedKFold(y_train, 10)

    for kernel_cls in kernel_classes:
        model = kernel_cls()
        print(model.name, end=', ')

        # Whatever `train` returns is handed straight to `test` together with the held-out data.
        best_fit = model.train(cvf, X_train, y_train)
        results = model.test(best_fit, X_test, y_test)

        # Record the error as the complement of the reported test score.
        test_error = 1 - results['test_score']
        data['Kernel'].append(model.name)
        data['Test Error'].append(test_error)

    # Terminate the comma-separated progress line for this repetition.
    print()


Iteration 0
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 1
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 2
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 3
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 4
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 5
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 6
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 7
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 8
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 9
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 10
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 11
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 12
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 13
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 14
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 15
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 16
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 17
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 18
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 19
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 

In [4]:
# Turn the collected results into a long-format frame (columns: 'Kernel', 'Test Error').
df = pd.DataFrame(data)

# Mean test error per kernel across all repetitions; displayed as the cell output.
errors_by_kernel = df.groupby('Kernel')['Test Error']
errors_by_kernel.mean()


Out[4]:
Kernel
K0        0.102875
K1        0.116375
M3        0.108000
M4        0.098250
M5        0.124750
M6        0.107125
M7        0.100000
M9        0.104250
MC        0.100000
MD        0.107000
ME        0.104375
RBF       0.105875
Name: Test Error, dtype: float64

In [5]:
# Boolean mask selecting only the rows for the K0, K1 and RBF kernels.
univ_kernels = df['Kernel'].isin(['K0', 'K1', 'RBF'])

# Box plot of the test-error distribution for those three kernels, one box per kernel.
df.loc[univ_kernels].boxplot(by='Kernel');



In [6]:
# Box plot of the test-error distribution for every kernel, one box per kernel name.
# The trailing semicolon suppresses the textual repr of the returned plot object.
df.boxplot(by='Kernel');