In [1]:
import warnings

import numpy as np
import pandas as pd
import scipy
import sklearn
import sklearn.datasets
import sklearn.metrics
import sklearn.svm

%pylab inline
warnings.filterwarnings('ignore')
In [2]:
# Load both splits in one call so they are guaranteed to have the same number
# of feature columns. Separate load_svmlight_file calls each infer the width
# from their own file, which can disagree and make predict() fail later.
labeled_sparse, labeled_labels, unlabeled_sparse, unlabeled_labels = sklearn.datasets.load_svmlight_files(
    ['weird_data/gmm_degenerate.svm', 'weird_data/gmm_degenerate.svm.t'])

# The loader returns scipy sparse matrices; the plotting and mesh code in this
# notebook indexes plain dense ndarrays.
labeled_data = labeled_sparse.toarray()
unlabeled_data = unlabeled_sparse.toarray()
In [3]:
# Overlay both point sets on one axes. Transposing turns each (n_points, n_columns)
# array into one row per column, which is then unpacked as scatter's positional args.
labeled_columns = labeled_data.T
unlabeled_columns = unlabeled_data.T
scatter(*labeled_columns)
scatter(*unlabeled_columns)
Out[3]:
In [4]:
def train_svm(dual, termination_tolerance):
    """Fit an L2-regularised, squared-hinge linear SVM on the labeled data.

    Parameters
    ----------
    dual : bool
        Forwarded to ``LinearSVC(dual=...)``; selects whether the dual or the
        primal optimisation problem is solved.
    termination_tolerance : float
        Forwarded to ``LinearSVC(tol=...)`` as the solver's stopping tolerance.

    Returns
    -------
    sklearn.svm.LinearSVC
        The classifier fitted on ``labeled_data`` / ``labeled_labels``.
    """
    # 'squared_hinge' is the current name of the loss formerly spelled 'l2';
    # the old alias is deprecated and rejected by newer scikit-learn releases.
    svm = sklearn.svm.LinearSVC(loss='squared_hinge', penalty='l2', dual=dual,
                                fit_intercept=False, tol=termination_tolerance)
    svm.fit(labeled_data, labeled_labels)
    return svm
In [5]:
# Sweep the stopping tolerance over 25 powers of ten: 1e-10, 1e-9, ..., 1e14.
exponents = np.arange(-10, 15)
termination_tolerances = np.power(10.0, exponents)

# Train one classifier per tolerance: all dual-form fits first, then all primal-form fits.
dual_svms = []
for tol in termination_tolerances:
    dual_svms.append(train_svm(dual=True, termination_tolerance=tol))

primal_svms = []
for tol in termination_tolerances:
    primal_svms.append(train_svm(dual=False, termination_tolerance=tol))
In [6]:
def accuracy(svm):
    """Return the fraction of ``unlabeled_data`` points whose prediction by
    ``svm`` matches ``unlabeled_labels``."""
    return sklearn.metrics.accuracy_score(unlabeled_labels, svm.predict(unlabeled_data))


# One accuracy per trained classifier, in the same order as termination_tolerances.
dual_accuracy = [accuracy(svm) for svm in dual_svms]
primal_accuracy = [accuracy(svm) for svm in primal_svms]
In [17]:
# Accuracy of the dual-form classifiers against the stopping tolerance (log x-axis).
plt.title('Dual Accuracy vs. Tolerance')
plt.xlabel('Termination tolerance')
plt.ylabel('Accuracy')
# Pin the y-range to the full [0, 1] accuracy scale so this figure is directly
# comparable with the primal one.
plt.ylim((0, 1))
plt.semilogx(termination_tolerances, dual_accuracy)
Out[17]:
In [18]:
# Accuracy of the primal-form classifiers against the stopping tolerance (log x-axis).
plt.title('Primal Accuracy vs. Tolerance')
plt.xlabel('Termination tolerance')
plt.ylabel('Accuracy')
# Pin the y-range to the full [0, 1] accuracy scale so this figure is directly
# comparable with the dual one.
plt.ylim((0, 1))
plt.semilogx(termination_tolerances, primal_accuracy)
Out[18]:
It appears that the accuracies obtained from the primal and dual formulations diverge once the termination tolerance rises above a certain threshold.
In [9]:
def plot_boundary(svm, h=0.02):
    """Draw the decision regions of ``svm`` with every data point overlaid.

    Adapted from http://scikit-learn.org/stable/auto_examples/svm/plot_iris.html

    Parameters
    ----------
    svm : fitted classifier
        Object whose ``predict`` accepts an (n_points, 2) array.
    h : float, optional
        Step size of the evaluation mesh along both axes. Smaller values give
        a sharper boundary but evaluate ``predict`` on more points.

    Raises
    ------
    ValueError
        If ``h`` is not strictly positive.
    """
    if h <= 0:
        raise ValueError("mesh step h must be positive, got %r" % (h,))

    X = np.concatenate([labeled_data, unlabeled_data])
    y = np.concatenate([labeled_labels, unlabeled_labels])

    # Create a mesh covering the bounding box of all points, padded by 1 unit
    # on every side.
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    plt.figure(figsize=(12, 12))

    # Plot the decision boundary. For that, we assign a color to each
    # point in the mesh [x_min, x_max] x [y_min, y_max].
    Z = svm.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)

    # Overlay the labeled and unlabeled points, colored by their true labels
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.xticks(())
    plt.yticks(())
    plt.show()
In [12]:
# Show the decision boundary of each dual-form classifier, one figure per tolerance.
for termination_tolerance, svm in zip(termination_tolerances, dual_svms):
    # Call form of print with a single pre-formatted string: identical output
    # on Python 2 and valid syntax on Python 3.
    print("termination_tolerance = %.2e" % termination_tolerance)
    plot_boundary(svm)
In [13]:
# Show the decision boundary of each primal-form classifier, one figure per tolerance.
for termination_tolerance, svm in zip(termination_tolerances, primal_svms):
    # Call form of print with a single pre-formatted string: identical output
    # on Python 2 and valid syntax on Python 3.
    print("termination_tolerance = %.2e" % termination_tolerance)
    plot_boundary(svm)
In [ ]: