In [1]:

    
import warnings

import numpy as np
import pandas as pd
import scipy

import sklearn
import sklearn.datasets
import sklearn.metrics
import sklearn.svm

%pylab inline

warnings.filterwarnings('ignore')









    



Populating the interactive namespace from numpy and matplotlib

Load the data that showed surprising behavior



In [2]:

    
# Load both files in one call so the two matrices are guaranteed to have the
# same number of feature columns; loading them separately infers the column
# count per file and can silently disagree.
(labeled_data, labeled_labels,
 unlabeled_data, unlabeled_labels) = sklearn.datasets.load_svmlight_files(
    ['weird_data/gmm_degenerate.svm', 'weird_data/gmm_degenerate.svm.t'])

# Convert the sparse matrices to plain dense ndarrays; the cells below
# transpose, slice and concatenate them as ordinary arrays.
labeled_data = labeled_data.toarray()
unlabeled_data = unlabeled_data.toarray()



In [3]:

    
# Overlay both point sets on a single axes.  Colors are set explicitly so the
# two sets stay distinguishable whatever matplotlib's default scatter color is.
# Unpacking `.T` passes one row per positional argument, so this expects
# exactly two feature columns (x, y).
plt.title('Labeled (blue) vs. unlabeled (red) points')
plt.xlabel('feature 0')
plt.ylabel('feature 1')
plt.scatter(*labeled_data.T, c='b')
plt.scatter(*unlabeled_data.T, c='r')









    Out[3]:





<matplotlib.collections.PathCollection at 0x7f8111966e50>

Train some SVMs

We want to try a range of termination tolerances, using both the primal and the dual formulation of the solver.



In [4]:

    
def train_svm(dual, termination_tolerance):
    """Fit a linear SVM on the notebook-level labeled data.

    Parameters
    ----------
    dual : bool
        Forwarded to ``LinearSVC(dual=...)``; selects the dual (True) or
        primal (False) optimization problem.
    termination_tolerance : float
        Forwarded to ``LinearSVC(tol=...)`` as the solver's stopping tolerance.

    Returns
    -------
    sklearn.svm.LinearSVC
        The classifier after ``fit(labeled_data, labeled_labels)``.
    """
    # 'squared_hinge' is the supported name of the loss formerly selected with
    # the deprecated alias 'l2'; the penalty is still the L2 norm.
    # fit_intercept=False: no bias term is learned.
    svm = sklearn.svm.LinearSVC(
        loss='squared_hinge',
        penalty='l2',
        dual=dual,
        fit_intercept=False,
        tol=termination_tolerance,
    )
    svm.fit(labeled_data, labeled_labels)

    return svm



In [5]:

    
# One tolerance per power of ten, for exponents -10 through 14 inclusive.
termination_tolerances = np.power(10.0, np.arange(-10, 15))

# Train the complete dual sweep first, then the complete primal sweep.
dual_svms = [
    train_svm(dual=True, termination_tolerance=tol)
    for tol in termination_tolerances
]
primal_svms = [
    train_svm(dual=False, termination_tolerance=tol)
    for tol in termination_tolerances
]



In [6]:

    
def accuracy(svm):
    """Return the accuracy score of ``svm``'s predictions on ``unlabeled_data``.

    The predictions are compared against ``unlabeled_labels`` with
    ``sklearn.metrics.accuracy_score``.
    """
    predicted_labels = svm.predict(unlabeled_data)
    return sklearn.metrics.accuracy_score(unlabeled_labels, predicted_labels)


# One score per trained model, in the same order as the tolerance sweep.
dual_accuracy = [accuracy(svm) for svm in dual_svms]
primal_accuracy = [accuracy(svm) for svm in primal_svms]



In [17]:

    
# Called without data, semilogx() only switches the x axis to a log scale,
# which suits tolerances spread over many orders of magnitude.
plt.semilogx()
plt.ylim((0, 1))
plt.title('Dual Accuracy vs. Tolerance')
plt.xlabel('termination tolerance')
plt.ylabel('accuracy on unlabeled data')
plt.plot(termination_tolerances, dual_accuracy)









    Out[17]:





[<matplotlib.lines.Line2D at 0x7f81101acb90>]



In [18]:

    
# Called without data, semilogx() only switches the x axis to a log scale,
# which suits tolerances spread over many orders of magnitude.
plt.semilogx()
plt.ylim((0, 1))
plt.title('Primal Accuracy vs. Tolerance')
plt.xlabel('termination tolerance')
plt.ylabel('accuracy on unlabeled data')
plt.plot(termination_tolerances, primal_accuracy)









    Out[18]:





[<matplotlib.lines.Line2D at 0x7f81100a4d10>]

It appears that the primal and dual results diverge once the tolerance exceeds a certain threshold.



In [9]:

    
def plot_boundary(svm, h=0.02):
    """Draw the classes predicted by ``svm`` over a 2-D mesh, plus all data points.

    Adapted from http://scikit-learn.org/stable/auto_examples/svm/plot_iris.html

    Parameters
    ----------
    svm : object with a ``predict`` method
        Called on an (n_points, 2) array of mesh coordinates.
    h : float, optional
        Step size of the mesh along both axes.  Defaults to 0.02, the value
        that was previously hard-coded.

    Raises
    ------
    ValueError
        If ``h`` is not strictly positive.

    Notes
    -----
    Reads the notebook-level ``labeled_data``, ``labeled_labels``,
    ``unlabeled_data`` and ``unlabeled_labels``.  The figure is displayed with
    ``plt.show()``; nothing is returned.
    """
    if h <= 0:
        raise ValueError("h must be positive, got %r" % (h,))

    # Pool both data sets so the mesh and the scatter cover every point.
    X = np.concatenate([labeled_data, unlabeled_data])
    y = np.concatenate([labeled_labels, unlabeled_labels])

    # Create a mesh over the data's bounding box, padded by 1 on every side.
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    plt.figure(figsize=(12, 12))

    # Plot the decision boundary. For that, we assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    Z = svm.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot; contourf needs Z shaped like the mesh.
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)

    # Plot also the data points, colored by their label.
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    # Empty tuples remove the tick marks from both axes.
    plt.xticks(())
    plt.yticks(())

    plt.show()

Plot boundaries

Dual



In [12]:

    
# One boundary plot per dual-solver model, each preceded by its tolerance.
for termination_tolerance, svm in zip(termination_tolerances, dual_svms):
    # print(...) with a single pre-formatted string emits the same text on
    # Python 2 and is valid syntax on Python 3.
    print("termination_tolerance = %.2e" % termination_tolerance)
    plot_boundary(svm)









    



termination_tolerance = 1.00e-10






    












    



termination_tolerance = 1.00e-09






    












    



termination_tolerance = 1.00e-08






    












    



termination_tolerance = 1.00e-07






    












    



termination_tolerance = 1.00e-06






    












    



termination_tolerance = 1.00e-05






    












    



termination_tolerance = 1.00e-04






    












    



termination_tolerance = 1.00e-03






    












    



termination_tolerance = 1.00e-02






    












    



termination_tolerance = 1.00e-01






    












    



termination_tolerance = 1.00e+00






    












    



termination_tolerance = 1.00e+01






    












    



termination_tolerance = 1.00e+02






    












    



termination_tolerance = 1.00e+03






    












    



termination_tolerance = 1.00e+04






    












    



termination_tolerance = 1.00e+05






    












    



termination_tolerance = 1.00e+06






    












    



termination_tolerance = 1.00e+07






    












    



termination_tolerance = 1.00e+08






    












    



termination_tolerance = 1.00e+09






    












    



termination_tolerance = 1.00e+10






    












    



termination_tolerance = 1.00e+11






    












    



termination_tolerance = 1.00e+12






    












    



termination_tolerance = 1.00e+13






    












    



termination_tolerance = 1.00e+14

Primal



In [13]:

    
# One boundary plot per primal-solver model, each preceded by its tolerance.
for termination_tolerance, svm in zip(termination_tolerances, primal_svms):
    # print(...) with a single pre-formatted string emits the same text on
    # Python 2 and is valid syntax on Python 3.
    print("termination_tolerance = %.2e" % termination_tolerance)
    plot_boundary(svm)









    



termination_tolerance = 1.00e-10






    












    



termination_tolerance = 1.00e-09






    












    



termination_tolerance = 1.00e-08






    












    



termination_tolerance = 1.00e-07






    












    



termination_tolerance = 1.00e-06






    












    



termination_tolerance = 1.00e-05






    












    



termination_tolerance = 1.00e-04






    












    



termination_tolerance = 1.00e-03






    












    



termination_tolerance = 1.00e-02






    












    



termination_tolerance = 1.00e-01






    












    



termination_tolerance = 1.00e+00






    












    



termination_tolerance = 1.00e+01






    












    



termination_tolerance = 1.00e+02






    












    



termination_tolerance = 1.00e+03






    












    



termination_tolerance = 1.00e+04






    












    



termination_tolerance = 1.00e+05






    












    



termination_tolerance = 1.00e+06






    












    



termination_tolerance = 1.00e+07






    












    



termination_tolerance = 1.00e+08






    












    



termination_tolerance = 1.00e+09






    












    



termination_tolerance = 1.00e+10






    












    



termination_tolerance = 1.00e+11






    












    



termination_tolerance = 1.00e+12






    












    



termination_tolerance = 1.00e+13






    












    



termination_tolerance = 1.00e+14



In [ ]: