In [2]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from sklearn import cross_validation as cv
from sklearn import grid_search as gs
from sklearn import svm
from kcat.datasets import GMonks
plt.style.use('ggplot')
Let's see how a classifier behaves when changing the dataset parameters.
In [6]:
# Define parameters
sizes = (50, 100, 200, 400)
attrs = (1, 2, 3, 4, 5, 6)
repeat = 100
# Try all possible combinations
results = np.zeros((len(sizes), len(attrs), repeat))
for i, m in enumerate(sizes):
    for j, d in enumerate(attrs):
        print("{} {}".format(m, d), end=', ')
        for k in range(repeat):
            # Generate a new dataset
            Xq, Xc, y = GMonks(m=m, d=d).data_arrays
            clf = svm.SVC(kernel='rbf')
            results[i][j][k] = cv.cross_val_score(clf, Xc, y, cv=5).mean()
# Invert results to show error rate instead of success rate
results = 1.0 - results
In [11]:
# Plot error
# Plot error
plt.figure(figsize=(10, 5))
styles = (':', '-.', '--', '-')
for i, m in enumerate(sizes):
    plt.plot(attrs, results[i].mean(axis=1), styles[i], linewidth=1.5, color=(0.7, 0.4, 0))
plt.xlabel("P")
plt.ylabel("Error")
plt.ylim(0.2, 0.6)
plt.legend(["Size {}".format(m) for m in sizes])
# plt.title("Classification Error using RBF Kernel")
Out[11]:
In [70]:
# Plot error
plt.figure(figsize=(8, 4), dpi=150)
colors = ('r', 'g', 'b', 'k')
for i, m in enumerate(sizes):
    plt.plot(attrs, results[i].mean(axis=1), '.-', color=colors[i])
plt.xlabel("Number of attributes / 6")
plt.ylabel("Error")
plt.ylim(0, 1)
plt.legend(["Size {}".format(m) for m in sizes], bbox_to_anchor=(1, 1))
plt.title("SVM with RBF kernel: classification error for different sizes")
Out[70]:
For smaller datasets with fewer attributes the error rate varies a bit from one generated dataset to another (although all kernels perform similarly), while for large datasets or many attributes it stabilizes at around 0.4. Generating datasets of different sizes or numbers of attributes is therefore unlikely to give much insight into how the kernels perform.
It is thus reasonable to use a small dataset with not too many attributes, to keep computations fast.
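As a rough, machine-dependent illustration of that cost, here is a minimal timing sketch (not part of the original runs) that reuses the GMonks generator and the 5-fold cross-validation call from above:
import time
# Hypothetical quick check: time one small and one large configuration.
for m, d in [(100, 1), (400, 6)]:
    Xq, Xc, y = GMonks(m=m, d=d).data_arrays
    start = time.time()
    cv.cross_val_score(svm.SVC(kernel='rbf'), Xc, y, cv=5)
    print("m={}, d={}: {:.1f}s".format(m, d, time.time() - start))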
The previous section was only a quick glance: the kernel was used with its default parameters, which is not very informative.
Let's now properly fit the SVM to the dataset and see how well it performs on some independent examples.
As seen above, there is no point in making the dataset too big, so 200 examples (100 for training and 100 for testing) with d=1 (6 attributes) will do.
In [38]:
X, y, dummy_encoder = gmonks(200, d=1)
# Encode attributes as dummy variables for the RBF kernel
Xb = dummy_encoder(X)
# Split train and test
X_train, X_test, y_train, y_test = cv.train_test_split(Xb, y, train_size=100, test_size=100)
# Use stratified 5-fold cross-validation
cvf = cv.StratifiedKFold(y_train, 5)
Now define the search space, specifying an array of values for each parameter.
In [48]:
Cs = 10.0 ** np.arange(-1, 5)
gammas = 1.4 ** np.arange(-20, 0)
# A parameter grid is passed to the library to perform the search
param_grid = dict(C=Cs, gamma=gammas)
Find the best parameters and use them to fit a model.
In [49]:
estimator = svm.SVC(kernel='rbf')
search_result = gs.GridSearchCV(estimator, param_grid=param_grid, cv=cvf)
search_result.fit(X_train, y_train)
"Best score: {:0.3f}, parameters: {}".format(search_result.best_score_, search_result.best_params_)
Out[49]:
Plotting the scores for each parameter combination lets us see whether the search space is adequate.
In [50]:
# Draw heatmap of accuracy as a function of gamma and C
scores = [x[1] for x in search_result.grid_scores_]
scores = np.array(scores).reshape(len(Cs), len(gammas))
plt.figure(figsize=(12, 4), dpi=150)
plt.imshow(scores, interpolation='nearest', cmap=cm.PiYG)
plt.xlabel('gamma')
plt.xticks(np.arange(len(gammas)), gammas, rotation=45)
plt.ylabel('C')
plt.yticks(np.arange(len(Cs)), Cs, rotation=45)
plt.colorbar()
None
In [4]:
# See how RBF performs with different parameters:
X, y, bincoder = gmonks(200, d=1)
Xb = bincoder(X)
clf = svm.SVC(kernel='rbf')
costs = 10.0 ** np.arange(-1, 6)
gammas = 2.0 ** np.arange(-13, 0)
params = dict(C=costs, gamma=gammas)
# GridSearch takes care of finding the best parameters using cross-validation:
grid = gs.GridSearchCV(clf, param_grid=params, cv=10)
grid.fit(Xb, y)
rbf_results = grid.best_params_
"Best score: {:0.3f}, parameters: {}".format(grid.best_score_, grid.best_params_)
Out[4]:
In [5]:
# Draw heatmap of accuracy as a function of gamma and C
# using only the actual scores:
scores = [x[1] for x in grid.grid_scores_]
scores = np.array(scores).reshape(len(costs), len(gammas))
plt.figure(figsize=(12, 4), dpi=150)
plt.imshow(scores, interpolation='nearest', cmap=cm.PiYG)
plt.xlabel('gamma')
plt.xticks(np.arange(len(gammas)), gammas, rotation=45)
plt.ylabel('C')
plt.yticks(np.arange(len(costs)), costs, rotation=45)
plt.colorbar()
None
In [6]:
# Generate a dataset and split the data in train and test:
X, y, bincoder = gmonks(200, 1)
X_train, X_test, y_train, y_test = cv.train_test_split(X, y, test_size=0.5)
Xb_train = bincoder(X_train)
Xb_test = bincoder(X_test)
pgen = get_pgen(X_train)
cvf = cv.StratifiedKFold(y_train, 5)
In [7]:
Cs = 10.0 ** np.arange(-1, 6)
gammas = 2.0 ** np.arange(-12, 1)
gs_rbf = gs.GridSearchCV(svm.SVC(kernel='rbf'), param_grid=dict(C=Cs, gamma=gammas), cv=cvf)
gs_rbf.fit(Xb_train, y_train)
"Best score: {:0.3f}, parameters: {}".format(gs_rbf.best_score_, gs_rbf.best_params_)
Out[7]:
In [8]:
Cs = 10.0 ** np.arange(-1, 6)
functions = [
    ('ident', 'ident'),
    ('ident', 'f1'),
    ('f1', 'ident'),
]
gammas = 2.0 ** np.arange(-4, 3)
gs_k0 = GridSearchK0(svm.SVC(kernel='precomputed', max_iter=2**15), functions, gammas, param_grid=dict(C=Cs), cv=cvf)
gs_k0.fit(X_train, y_train)
"Best score: {:0.3f}, parameters: {}".format(gs_k0.best_score_, gs_k0.best_params_)
Out[8]:
In [9]:
Cs = 10.0 ** np.arange(-1, 6)
functions = [
    ('ident', 'ident'),
    ('ident', 'f1'),
    ('ident', 'f2'),
    ('f1', 'ident'),
]
gammas = 2.0 ** np.arange(-4, 3)
alphas = 2.0 ** np.arange(-2, 3)
gs_k1 = GridSearchK1(svm.SVC(kernel='precomputed', max_iter=2**15), alphas, functions, gammas, param_grid=dict(C=Cs), cv=cvf)
gs_k1.fit(X_train, y_train, pgen)
"Best score: {:0.3f}, parameters: {}".format(gs_k1.best_score_, gs_k1.best_params_)
Out[9]:
In [11]:
# RBF
y_predict = gs_rbf.best_estimator_.predict(Xb_test)
"Score: {:0.3f}".format((y_predict == y_test).mean())
Out[11]:
In [12]:
# K0
gram = fast_k0(X_test, X_train, **gs_k0.best_params_[1])
y_predict = gs_k0.best_estimator_.predict(gram)
"Score: {:0.3f}".format((y_predict == y_test).mean())
Out[12]:
In [13]:
# K1
gram = fast_k1(X_test, X_train, pgen, **gs_k1.best_params_[1])
y_predict = gs_k1.best_estimator_.predict(gram)
"Score: {:0.3f}".format((y_predict == y_test).mean())
Out[13]:
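For convenience, the three test accuracies above can be recomputed and printed side by side. This is a minimal sketch that only reuses objects already defined in the preceding cells (gs_rbf, gs_k0, gs_k1, the encoded and raw test sets, pgen, and the fast_k0/fast_k1 helpers):
# Sketch: collect the three test accuracies in one place for comparison.
test_scores = {
    'RBF': (gs_rbf.best_estimator_.predict(Xb_test) == y_test).mean(),
    'K0': (gs_k0.best_estimator_.predict(
        fast_k0(X_test, X_train, **gs_k0.best_params_[1])) == y_test).mean(),
    'K1': (gs_k1.best_estimator_.predict(
        fast_k1(X_test, X_train, pgen, **gs_k1.best_params_[1])) == y_test).mean(),
}
for name, score in sorted(test_scores.items()):
    print("{}: {:0.3f}".format(name, score))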