Soybean Dataset


In [1]:
from sklearn import svm
from sklearn import cross_validation as cv
import pandas as pd

plt.style.use('ggplot')

from kcat.datasets import Soybean
from kcat.kernels import helpers as kh
from kcat.kernels import search as ks
from kcat.utils import get_pgen

In [2]:
# Load the dataset
dataset = Soybean()

In [3]:
data = {'Kernel': [], 'Test Error': []}
for i in range(5):
    print("Iteration {}".format(i))
    # Split train and test
    X_train, X_test, y_train, y_test = dataset.train_test_split(test_size=0.33, random_state=i)
    # Cross-validation
    cvf = cv.StratifiedKFold(y_train, 10)
    # Evaluate models
    for model_class in kh.RBF, kh.K0, kh.K1, kh.M3, kh.M4, kh.M5, kh.M6, kh.M7, kh.M9, kh.MC, kh.MD, kh.ME:
        model = model_class()
        print(model.name, end=', ')
        best_fit = model.train(cvf, X_train, y_train)
        results = model.test(best_fit, X_test, y_test)
        data['Kernel'].append(model.name)
        data['Test Error'].append(1 - results['test_score'])
    print()


Iteration 0
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 1
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 2
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 3
RBF, 
/home/alkxzv/.virtualenvs/pfc/lib/python3.4/site-packages/sklearn/cross_validation.py:413: Warning: The least populated class in y has only 9 members, which is too few. The minimum number of labels for any class cannot be less than n_folds=10.
  % (min_labels, self.n_folds)), Warning)
K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 4
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 

In [4]:
df = pd.DataFrame(data=data)
df.groupby('Kernel')['Test Error'].mean()


Out[4]:
Kernel
K0        0.097115
K1        0.083654
M3        0.075962
M4        0.083654
M5        0.075000
M6        0.077885
M7        0.081731
M9        0.076923
MC        0.081731
MD        0.075962
ME        0.079808
RBF       0.075000
Name: Test Error, dtype: float64

In [5]:
df.boxplot(by='Kernel');



In [5]: