Mushroom Dataset
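Comparison of SVM test error on the Mushroom dataset for the RBF kernel and the categorical kernels from the kcat package (K0, K1, M3–M7, M9, MC, MD, ME), averaged over ten random train/test splits with 10-fold cross-validation used for model selection.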


In [1]:
from sklearn import svm
from sklearn import cross_validation as cv
import matplotlib.pyplot as plt
import pandas as pd

plt.style.use('ggplot')

from kcat.datasets import Mushroom
from kcat.kernels import helpers as kh
from kcat.kernels import search as ks

In [2]:
# Load the dataset
dataset = Mushroom()

In [3]:
data = {'Kernel': [], 'Test Error': []}
for i in range(10):
    print("Iteration {}".format(i))
    # Split train and test
    X_train, X_test, y_train, y_test = dataset.train_test_split(train_size=250, test_size=500, random_state=i)
    # Cross-validation
    cvf = cv.StratifiedKFold(y_train, 10)
    # Evaluate models
    for model_class in kh.RBF, kh.K0, kh.K1, kh.M3, kh.M4, kh.M5, kh.M6, kh.M7, kh.M9, kh.MC, kh.MD, kh.ME:
        model = model_class()
        print(model.name, end=', ')
        best_fit = model.train(cvf, X_train, y_train)
        results = model.test(best_fit, X_test, y_test)
        data['Kernel'].append(model.name)
        data['Test Error'].append(1 - results['test_score'])
    print()


Iteration 0
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 1
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 2
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 3
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 4
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 5
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 6
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 7
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 8
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 9
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 

In [4]:
df = pd.DataFrame(data=data)
df.groupby('Kernel')['Test Error'].mean()


Out[4]:
Kernel
K0        0.0210
K1        0.0122
M3        0.0128
M4        0.0156
M5        0.0136
M6        0.0150
M7        0.0126
M9        0.0198
MC        0.0126
MD        0.0136
ME        0.0144
RBF       0.0120
Name: Test Error, dtype: float64
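
The table above reports only the mean test error per kernel. The spread across the ten splits can be summarized with the same groupby (a small sketch, not part of the original run):

# Mean and standard deviation of the test error for each kernel
df.groupby('Kernel')['Test Error'].agg(['mean', 'std'])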

In [5]:
df.boxplot(by='Kernel');
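
The boxplot groups the ten test-error values by kernel, showing the spread of each kernel alongside the means above. A minimal sketch of a labelled variant (hypothetical, not part of the original run; assumes only the matplotlib import added above):

# Boxplot of test error per kernel, with explicit labels
ax = df.boxplot(by='Kernel', column='Test Error')
ax.set_ylabel('Test Error')
plt.suptitle('')  # drop the automatic 'Boxplot grouped by Kernel' super-title
plt.title('Test error per kernel over 10 splits')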