In [1]:
import numpy as np
import pandas as pd
import scipy

import sklearn
import sklearn.datasets
import sklearn.metrics
import sklearn.svm

%pylab inline

import warnings
warnings.filterwarnings('ignore')


Populating the interactive namespace from numpy and matplotlib

Load the data that showed surprising behavior


In [2]:
# Load both splits with a single call so they are parsed with the same number
# of features and the same index-base detection. Loading the files one at a
# time can give matrices of different widths if one file never mentions the
# highest-numbered feature.
labeled_data, labeled_labels, unlabeled_data, unlabeled_labels = sklearn.datasets.load_svmlight_files(
    ['weird_data/gmm_degenerate.svm', 'weird_data/gmm_degenerate.svm.t'])

# The loader returns scipy sparse matrices; convert them to dense ndarrays.
labeled_data = labeled_data.toarray()
unlabeled_data = unlabeled_data.toarray()

In [3]:
# Overlay the labeled and unlabeled samples on the same axes. Transposing
# turns each feature column into one positional argument of scatter.
plt.scatter(*labeled_data.T)
plt.scatter(*unlabeled_data.T)


Out[3]:
<matplotlib.collections.PathCollection at 0x7f8111966e50>

Train some SVMs

We want to try a range of termination tolerances, using both the primal and the dual formulation


In [4]:
def train_svm(dual, termination_tolerance):
    """Fit a linear SVM on the labeled data with the given solver settings.

    Parameters
    ----------
    dual : bool
        Forwarded to ``LinearSVC(dual=...)``; selects the dual (True) or
        primal (False) optimization problem.
    termination_tolerance : float
        Forwarded to ``LinearSVC(tol=...)`` as the stopping tolerance.

    Returns
    -------
    sklearn.svm.LinearSVC
        The classifier after ``fit(labeled_data, labeled_labels)``.
    """
    # 'squared_hinge' is the current spelling of the loss formerly named 'l2'.
    # The old alias was deprecated in scikit-learn 0.16 and later removed, so
    # it is spelled out here to keep the cell running on current releases.
    svm = sklearn.svm.LinearSVC(
        loss='squared_hinge',
        penalty='l2',
        dual=dual,
        fit_intercept=False,
        tol=termination_tolerance,
    )
    svm.fit(labeled_data, labeled_labels)

    return svm

In [5]:
# One tolerance per decade: exponents -10 through 14 inclusive.
termination_tolerances = np.power(10.0, np.arange(-10, 15))

# Fit every dual model first, then every primal model, one per tolerance.
dual_svms = []
for tol in termination_tolerances:
    dual_svms.append(train_svm(dual=True, termination_tolerance=tol))

primal_svms = []
for tol in termination_tolerances:
    primal_svms.append(train_svm(dual=False, termination_tolerance=tol))

In [6]:
def accuracy(svm):
    """Score ``svm`` on the unlabeled split.

    Predictions for ``unlabeled_data`` are compared against
    ``unlabeled_labels`` using ``sklearn.metrics.accuracy_score``.
    """
    return sklearn.metrics.accuracy_score(unlabeled_labels, svm.predict(unlabeled_data))


dual_accuracy = [accuracy(svm) for svm in dual_svms]
primal_accuracy = [accuracy(svm) for svm in primal_svms]

In [17]:
# Logarithmic x axis: the tolerances span 25 decades.
plt.semilogx()
plt.ylim((0,1))
plt.title('Dual Accuracy vs. Tolerance')
# Label the axes so the figure can be read without the surrounding code.
plt.xlabel('Termination tolerance')
plt.ylabel('Accuracy')
plt.plot(termination_tolerances, dual_accuracy)


Out[17]:
[<matplotlib.lines.Line2D at 0x7f81101acb90>]

In [18]:
# Logarithmic x axis: the tolerances span 25 decades.
plt.semilogx()
plt.ylim((0,1))
plt.title('Primal Accuracy vs. Tolerance')
# Label the axes so the figure can be read without the surrounding code.
plt.xlabel('Termination tolerance')
plt.ylabel('Accuracy')
plt.plot(termination_tolerances, primal_accuracy)


Out[18]:
[<matplotlib.lines.Line2D at 0x7f81100a4d10>]

It appears that the primal and dual results diverge once the termination tolerance rises above a certain threshold.


In [9]:
def plot_boundary(svm, mesh_step=0.02):
    """Plot the regions predicted by ``svm`` with all data points on top.

    Adapted from http://scikit-learn.org/stable/auto_examples/svm/plot_iris.html

    Parameters
    ----------
    svm : fitted classifier
        Object whose ``predict`` method is called on a two-column array of
        mesh coordinates.
    mesh_step : float, optional
        Spacing of the evaluation mesh along both axes. Defaults to 0.02, the
        value that used to be hard-coded; a larger step yields a coarser
        mesh with fewer points to classify.

    The function reads the notebook-level ``labeled_data``,
    ``unlabeled_data``, ``labeled_labels`` and ``unlabeled_labels``, and
    finishes by calling ``plt.show()``; it returns nothing.
    """
    # Show every sample, labeled and unlabeled alike.
    X = np.concatenate([labeled_data, unlabeled_data])
    y = np.concatenate([labeled_labels, unlabeled_labels])

    # Create a mesh that covers the data with a margin of 1 on each side.
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, mesh_step),
                         np.arange(y_min, y_max, mesh_step))

    plt.figure(figsize=(12,12))

    # Plot the decision boundary. For that, we assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    Z = svm.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)

    # Plot also the data points, colored by their label
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.xticks(())
    plt.yticks(())

    plt.show()

Plot boundaries

Dual


In [12]:
for termination_tolerance, svm in zip(termination_tolerances, dual_svms):
    # A single parenthesised argument prints the same text on Python 2 and
    # Python 3; the bare print statement is a SyntaxError on Python 3.
    print("termination_tolerance = %.2e" % termination_tolerance)
    plot_boundary(svm)


termination_tolerance = 1.00e-10
termination_tolerance = 1.00e-09