In [25]:
import numpy as np
import pandas as pd
import scipy

import sklearn
import sklearn.datasets

%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [26]:
def sample_from_gaussian(mean, covariance, n_samples):
    # draw n_samples points from a multivariate normal (numpy provides this; scipy.random is just a legacy alias)
    return np.random.multivariate_normal(mean=mean, cov=covariance, size=n_samples)

def sample_unit_variance_gaussian(mean, n_samples):
    # isotropic Gaussian: identity covariance matched to the dimensionality of the mean
    mean = np.array(mean)
    return sample_from_gaussian(mean=mean, n_samples=n_samples, covariance=np.identity(mean.shape[0]))
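
A quick shape check for the helper (an added sketch, not one of the original cells): samples should come back as an (n_samples, dim) array.

In [ ]:
# hypothetical sanity check: five draws from a 2-D isotropic Gaussian
demo = sample_unit_variance_gaussian((0, 0), 5)
print(demo.shape)  # expected: (5, 2)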

In [27]:
N_labeled = 1000
labeled_data = []
labeled_labels = []

N_unlabeled = 10000
unlabeled_data = []
unlabeled_labels = []

# unlabeled points: two clusters centred on the y-axis at (0, 5) and (0, -5)
unlabeled_data.extend(sample_unit_variance_gaussian((0,5), N_unlabeled))
unlabeled_labels.extend([1]*N_unlabeled)
unlabeled_data.extend(sample_unit_variance_gaussian((0,-5), N_unlabeled))
unlabeled_labels.extend([-1]*N_unlabeled)

# labeled points: two clusters centred on the x-axis at (5, 0) and (-5, 0)
labeled_data.extend(sample_unit_variance_gaussian((5,0), N_labeled))
labeled_labels.extend([1]*N_labeled)
labeled_data.extend(sample_unit_variance_gaussian((-5,0), N_labeled))
labeled_labels.extend([-1]*N_labeled)

unlabeled_data = np.array(unlabeled_data)
labeled_data = np.array(labeled_data)

In [28]:
scatter(*labeled_data.T)
scatter(*unlabeled_data.T)


Out[28]:
<matplotlib.collections.PathCollection at 0x7f521f833bd0>

In [29]:
sklearn.datasets.dump_svmlight_file(unlabeled_data, unlabeled_labels, 'gmm_degenerate.svm.t', zero_based=False)
sklearn.datasets.dump_svmlight_file(labeled_data, labeled_labels, 'gmm_degenerate.svm', zero_based=False)
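
The dumped files can be read back with sklearn's loader; a usage sketch (not one of the original cells), assuming the filenames written above:

In [ ]:
# load_svmlight_file returns a sparse feature matrix and a label vector
X_l, y_l = sklearn.datasets.load_svmlight_file('gmm_degenerate.svm', zero_based=False)
X_l = X_l.toarray()  # densify for plotting or fitting with LinearSVC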

View decision boundary of linear SVM trained on labeled data


In [30]:
import sklearn
import sklearn.svm

In [23]:
# loss='l2' is this sklearn version's name for the squared hinge loss
# (newer releases spell it loss='squared_hinge')
svm = sklearn.svm.LinearSVC(loss='l2', dual=True)
svm.fit(labeled_data, labeled_labels)


Out[23]:
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
     random_state=None, tol=0.0001, verbose=0)
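
Before plotting, a quick accuracy check (an added sketch, not part of the original run): score the labeled-only SVM on both splits to see how its boundary transfers to the unlabeled clusters.

In [ ]:
# mean accuracy on the training (labeled) data vs. the unlabeled clusters
print(svm.score(labeled_data, labeled_labels))
print(svm.score(unlabeled_data, unlabeled_labels))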

In [24]:
# create a mesh to plot in
X = np.concatenate([labeled_data, unlabeled_data])
y = np.concatenate([labeled_labels, unlabeled_labels])

h = 0.02
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

plt.figure(figsize=(12,12))

# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
plt.subplots_adjust(wspace=0.4, hspace=0.4)

Z = svm.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)

# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
plt.xlabel('feature 0')
plt.ylabel('feature 1')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())
    
plt.show()
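
To also show the margin, the decision function can be contoured over the same mesh (an added sketch, not one of the original cells): the solid line is the separating hyperplane, the dashed lines the +/-1 margin.

In [ ]:
# signed distance to the separating hyperplane over the mesh built above
D = svm.decision_function(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.contour(xx, yy, D, [-1, 0, 1], colors='k', linestyles=['--', '-', '--'])
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
plt.show()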



In [ ]: