Promoter Gene Dataset


In [1]:
from sklearn import svm
from sklearn import cross_validation as cv
import pandas as pd

plt.style.use('ggplot')

from kcat.datasets import Promoter
from kcat.kernels import helpers as kh
from kcat.kernels import search as ks

In [2]:
# Load the dataset
dataset = Promoter()

In [3]:
data = {'Kernel': [], 'Test Error': []}
for i in range(20):
    print("Iteration {}".format(i))
    # Split train and test
    X_train, X_test, y_train, y_test = dataset.train_test_split(test_size=0.33, random_state=i)
    # Cross-validation
    cvf = cv.StratifiedKFold(y_train, 10)
    # Evaluate models
    for model_class in kh.RBF, kh.K0, kh.K1, kh.M3, kh.M4, kh.M5, kh.M6, kh.M7, kh.M9, kh.MC, kh.MD, kh.ME:
        model = model_class()
        print(model.name, end=', ')
        best_fit = model.train(cvf, X_train, y_train)
        results = model.test(best_fit, X_test, y_test)
        data['Kernel'].append(model.name)
        data['Test Error'].append(1 - results['test_score'])
    print()


Iteration 0
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 1
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 2
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 3
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 4
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 5
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 6
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 7
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 8
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 9
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 10
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 11
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 12
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 13
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 14
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 15
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 16
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 17
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 18
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 
Iteration 19
RBF, K0, K1, M3, M4, M5, M6, M7, M9, MC, MD, ME, 

In [4]:
df = pd.DataFrame(data=data)
df.groupby('Kernel')['Test Error'].mean()


Out[4]:
Kernel
K0        0.101429
K1        0.145714
M3        0.130000
M4        0.128571
M5        0.112857
M6        0.127143
M7        0.092857
M9        0.120000
MC        0.092857
MD        0.124286
ME        0.108571
RBF       0.097143
Name: Test Error, dtype: float64

In [5]:
univ_kernels = (df.Kernel == 'K0') | (df.Kernel == 'K1') | (df.Kernel == 'RBF')
df[univ_kernels].boxplot(by='Kernel');



In [6]:
df.boxplot(by='Kernel');



In [6]: