Congressional Voting Dataset


In [1]:
from sklearn import svm
# NOTE(review): `sklearn.cross_validation` was deprecated in scikit-learn 0.18
# and removed in 0.20, so this notebook needs an older scikit-learn. It is kept
# as-is because the fold object built from it is handed to `model.train`.
from sklearn import cross_validation as cv
# pyplot must be imported explicitly: `plt` is used right below, and a fresh
# kernel does not define it unless pylab mode was pre-loaded.
import matplotlib.pyplot as plt
import pandas as pd

plt.style.use('ggplot')

from kcat.datasets import CongressionalVoting
from kcat.kernels import helpers as kh
from kcat.kernels import search as ks

In [2]:
# Instantiate the CongressionalVoting dataset class imported from kcat.datasets.
# The evaluation loop calls its `train_test_split` method to draw each split.
dataset = CongressionalVoting()

In [3]:
# Kernel model classes compared on every split, in evaluation order.
kernel_classes = (
    kh.RBF, kh.K0, kh.K1, kh.M3, kh.M4, kh.M5,
    kh.M6, kh.M7, kh.M9, kh.MC, kh.MD, kh.ME,
)

# Column-oriented accumulator: one entry per (split, kernel) evaluation.
data = {'Kernel': [], 'Test Error': []}

for seed in range(5):
    print("Iteration {}".format(seed))
    # Draw a new train/test partition; the loop counter doubles as the seed,
    # so each repetition uses a different split.
    X_train, X_test, y_train, y_test = dataset.train_test_split(
        test_size=0.33, random_state=seed)
    # Stratified folds built from the training labels (10 is the fold count).
    cvf = cv.StratifiedKFold(y_train, 10)
    for kernel_class in kernel_classes:
        model = kernel_class()
        print(model.name, end=', ')
        # Train with the CV folds, then score the selected fit on the held-out set.
        best_fit = model.train(cvf, X_train, y_train)
        outcome = model.test(best_fit, X_test, y_test)
        data['Kernel'].append(model.name)
        # Store the error as the complement of the reported test score.
        data['Test Error'].append(1 - outcome['test_score'])
    # Terminate the comma-separated progress line for this repetition.
    print()


Iteration 0
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 1
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 2
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 3
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 4
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 

In [4]:
# Tabulate the collected results: one row per (split, kernel) evaluation.
df = pd.DataFrame(data)
# Average the test error of each kernel across the splits and display it.
mean_error_by_kernel = df.groupby('Kernel')['Test Error'].mean()
mean_error_by_kernel


Out[4]:
Kernel
K0        0.037500
K1        0.043056
M3        0.051389
M4        0.043056
M5        0.048611
M6        0.038889
M7        0.044444
M9        0.040278
MC        0.044444
MD        0.043056
ME        0.040278
RBF       0.029167
Name: Test Error, dtype: float64

In [5]:
# Box plot of the per-split 'Test Error' values, one box per kernel.
# The trailing ';' keeps the notebook from echoing the returned object's repr.
df.boxplot(by='Kernel');