Synthetic Dataset Overview



In [1]:

    
# NOTE(review): `sklearn.cross_validation` is the legacy module name; newer
# scikit-learn releases expose `cross_val_score` from `sklearn.model_selection`
# instead -- confirm which scikit-learn version this notebook targets.
from sklearn import cross_validation as cv
from sklearn import svm

# NOTE(review): `plt` (and `np`, `figure`, `plot`, ... used in later cells) is
# never imported in this notebook; presumably the kernel was started in pylab
# mode -- confirm, or add explicit imports so Restart & Run All works.
plt.style.use('ggplot')

# Project-local generator for the synthetic datasets used in the experiment.
from kcat.datasets import Synthetic



In [2]:

    
# Experiment grid
sizes = (50, 100, 200, 400)          # values passed as Synthetic's first argument
ps = np.linspace(0, 1, num=11)       # eleven evenly spaced values of p in [0, 1]
repeat = 150                         # freshly generated datasets per (size, p) pair

# accuracy[size, p, repetition] holds the mean 5-fold CV score of one run
accuracy = np.zeros((len(sizes), len(ps), repeat))
for size_idx, size in enumerate(sizes):
    for p_idx, p_value in enumerate(ps):
        # Progress marker: one "<size> <p>" entry per grid cell
        print("{} {}".format(size, p_value), end=', ')
        for run in range(repeat):
            # Draw a new dataset for every repetition; only Xc and y are used
            Xq, Xc, y = Synthetic(size, n=25, c=2, p=p_value).data_arrays
            clf = svm.SVC(kernel='rbf')
            fold_scores = cv.cross_val_score(clf, Xc, y, cv=5)
            accuracy[size_idx, p_idx, run] = fold_scores.mean()

# Report the error rate (1 - score) rather than the score itself
results = 1.0 - accuracy









    



50 0.0, 50 0.1, 50 0.2, 50 0.30000000000000004, 50 0.4, 50 0.5, 50 0.6000000000000001, 50 0.7000000000000001, 50 0.8, 50 0.9, 50 1.0, 100 0.0, 100 0.1, 100 0.2, 100 0.30000000000000004, 100 0.4, 100 0.5, 100 0.6000000000000001, 100 0.7000000000000001, 100 0.8, 100 0.9, 100 1.0, 200 0.0, 200 0.1, 200 0.2, 200 0.30000000000000004, 200 0.4, 200 0.5, 200 0.6000000000000001, 200 0.7000000000000001, 200 0.8, 200 0.9, 200 1.0, 400 0.0, 400 0.1, 400 0.2, 400 0.30000000000000004, 400 0.4, 400 0.5, 400 0.6000000000000001, 400 0.7000000000000001, 400 0.8, 400 0.9, 400 1.0,



In [3]:

    
# Mean error for each (size, p) pair, averaged over the repetitions (last axis)
mean_error = results.mean(axis=2)
line_styles = (':', '-.', '--', '-')   # one line style per dataset size
line_color = (0.7, 0.4, 0)             # same RGB colour for every curve

figure(figsize=(10, 5))
for idx in range(len(sizes)):
    plot(ps, mean_error[idx], line_styles[idx], linewidth=1.5, color=line_color)
xlabel("P")
ylabel("Error")
ylim(0, 0.5)
# Legend entries follow plotting order; kept as the final expression so the
# cell's displayed output is unchanged.
legend(["Size {}".format(size) for size in sizes])









    Out[3]:





<matplotlib.legend.Legend at 0x7f4ede379dd8>