Synthetic Dataset Overview


In [1]:
from sklearn import cross_validation as cv
from sklearn import svm

# Switch matplotlib to the 'ggplot' style sheet.
# NOTE(review): `plt` is not imported in any visible cell (nor are `np`,
# `figure`, `plot`, ... used further down) — presumably these come from a
# pylab-style kernel setup; confirm, or add explicit imports.
plt.style.use('ggplot')

from kcat.datasets import Synthetic

In [2]:
# Experiment grid: dataset sizes, values of the generator argument `p`, and
# the number of freshly generated datasets per (size, p) combination.
sizes = (50, 100, 200, 400)
ps = np.linspace(0, 1, num=11)
repeat = 150

# results[i, j, k]: mean 5-fold cross-validation score of repetition k
# for sizes[i] and ps[j].
results = np.zeros((len(sizes), len(ps), repeat))
for i, m in enumerate(sizes):
    for j, p in enumerate(ps):
        # Progress marker, printed once per (size, p) combination
        print("{} {}".format(m, p), end=', ')
        for k in range(repeat):
            # Every repetition draws a brand-new dataset
            dataset = Synthetic(m, n=25, c=2, p=p)
            # Only Xc and y are used below; Xq is unpacked but not used here
            Xq, Xc, y = dataset.data_arrays
            clf = svm.SVC(kernel='rbf')
            fold_scores = cv.cross_val_score(clf, Xc, y, cv=5)
            results[i, j, k] = fold_scores.mean()

# Turn success rates into error rates
results = 1.0 - results


50 0.0, 50 0.1, 50 0.2, 50 0.30000000000000004, 50 0.4, 50 0.5, 50 0.6000000000000001, 50 0.7000000000000001, 50 0.8, 50 0.9, 50 1.0, 100 0.0, 100 0.1, 100 0.2, 100 0.30000000000000004, 100 0.4, 100 0.5, 100 0.6000000000000001, 100 0.7000000000000001, 100 0.8, 100 0.9, 100 1.0, 200 0.0, 200 0.1, 200 0.2, 200 0.30000000000000004, 200 0.4, 200 0.5, 200 0.6000000000000001, 200 0.7000000000000001, 200 0.8, 200 0.9, 200 1.0, 400 0.0, 400 0.1, 400 0.2, 400 0.30000000000000004, 400 0.4, 400 0.5, 400 0.6000000000000001, 400 0.7000000000000001, 400 0.8, 400 0.9, 400 1.0, 

In [3]:
# Error curves: one line per dataset size, error averaged over repetitions
plt.figure(figsize=(10, 5))
styles = (':', '-.', '--', '-')
line_color = (0.7, 0.4, 0)
# Rows of `results` follow the order of `sizes`, so pair each row with a style
for line_style, size_errors in zip(styles, results):
    mean_error = size_errors.mean(axis=1)
    plt.plot(ps, mean_error, line_style, linewidth=1.5, color=line_color)
plt.xlabel("P")
plt.ylabel("Error")
plt.ylim(0, 0.5)
# Optional title (currently disabled):
# plt.title("Classification Error using RBF Kernel")
plt.legend(["Size {}".format(m) for m in sizes])


Out[3]:
<matplotlib.legend.Legend at 0x7f4ede379dd8>