More on classification: SVM and Boosting


In [ ]:
import time

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import scipy as sp
import scipy.sparse.linalg as linalg
import scipy.cluster.hierarchy as hr
from scipy.spatial.distance import pdist, squareform

import sklearn.datasets as datasets
import sklearn.metrics as metrics
import sklearn.utils as utils
import sklearn.linear_model as linear_model
import sklearn.svm as svm
import sklearn.cross_validation as cross_validation  # renamed sklearn.model_selection in newer scikit-learn
import sklearn.cluster as cluster
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

import statsmodels.api as sm

from patsy import dmatrices

import seaborn as sns
%matplotlib inline

Support Vector Machines (SVM)

Working with the wine dataset available here:

https://archive.ics.uci.edu/ml/datasets/Wine


In [ ]:
wine = pd.read_table("datasets/wine.data", sep=',', header=None)

attributes = ['region',
              'Alcohol',
              'Malic acid',
              'Ash',
              'Alcalinity of ash',
              'Magnesium',
              'Total phenols',
              'Flavanoids',
              'Nonflavanoid phenols',
              'Proanthocyanins',
              'Color intensity',
              'Hue',
              'OD280/OD315 of diluted wines',
              'Proline']



wine.columns = attributes
X = wine[['Alcohol', 'Proline']].values
grape = wine.pop('region')
y = grape.values

wine.info()

In [ ]:
wine.head()

In [ ]:
svc = svm.SVC(kernel='linear')
svc.fit(X, y)

Evaluating the fit of the classifier graphically


In [ ]:
from matplotlib.colors import ListedColormap
# Create color maps for 3-class classification problem, as with iris
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

def plot_estimator(estimator, X, y):
    
    try:
        X, y = X.values, y.values
    except AttributeError:
        pass
    
    estimator.fit(X, y)
    x_min, x_max = X[:, 0].min() - .1, X[:, 0].max() + .1
    y_min, y_max = X[:, 1].min() - .1, X[:, 1].max() + .1
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))
    Z = estimator.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

    # Plot also the training points
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
    plt.axis('tight')
    plt.axis('off')
    plt.tight_layout()

In [ ]:
plot_estimator(svc, X, y)

The SVM gets its name from the training samples of each class that lie closest to the other class. These samples are called "support vectors" because moving them in p-dimensional space would move the decision boundary.

In scikit-learn, the support vectors of a fitted SVC are stored in its support_vectors_ attribute (their indices in the training data are in support_). Here is a two-class problem using only classes 1 and 2 of the wine dataset.

The support vectors are circled.
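
As a quick sanity check, the fitted classifier exposes these quantities directly. A minimal sketch (reusing the svc fitted on all three classes above) prints support_ (indices of the support vectors in the training data), support_vectors_ (the vectors themselves), and n_support_ (the count per class).

In [ ]:
# Inspect the support vectors of the fitted classifier:
#   support_          -> indices of the support vectors in the training data
#   support_vectors_  -> the support vectors themselves
#   n_support_        -> number of support vectors per class
print(svc.support_)
print(svc.support_vectors_[:5])
print(svc.n_support_)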


In [ ]:
# Extract classes 1 and 2
X, y = X[np.in1d(y, [1, 2])], y[np.in1d(y, [1, 2])]

plot_estimator(svc, X, y)
plt.scatter(svc.support_vectors_[:, 0], 
           svc.support_vectors_[:, 1], 
           s=120, 
           facecolors='none', 
           linewidths=2,
           zorder=10)

Regularization

These two classes do not appear to be linearly separable.

For classes that are not linearly separable we turn to regularization, which is tuned via the C parameter. In practice, a large C value means few support vectors (less regularization), while a small C value means many support vectors (more regularization). scikit-learn uses a default of C=1.
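
To make the effect of C concrete, here is a small sketch (using the two-class X and y extracted above; the particular C values are only illustrative) that counts the support vectors as C grows:

In [ ]:
# Larger C -> less regularization -> fewer support vectors.
for C in [1e-2, 1e0, 1e2, 1e6]:
    model = svm.SVC(kernel='linear', C=C).fit(X, y)
    print('C = %g: %d support vectors' % (C, len(model.support_vectors_)))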


In [ ]:
svc = svm.SVC(kernel='linear', C=1e6)
plot_estimator(svc, X, y)
plt.scatter(svc.support_vectors_[:, 0], svc.support_vectors_[:, 1], s=80, 
            facecolors='none', linewidths=2, zorder=10)
plt.title('High C values: small number of support vectors')

svc = svm.SVC(kernel='linear', C=1e-2)
plot_estimator(svc, X, y)
plt.scatter(svc.support_vectors_[:, 0], svc.support_vectors_[:, 1], s=80, 
            facecolors='none', linewidths=2, zorder=10)
plt.title('Low C values: high number of support vectors')

Kernels

We can also choose from a suite of built-in kernels (linear, poly, rbf, sigmoid, precomputed), or pass a custom kernel as a function. Note that the radial basis function (rbf) kernel is just a Gaussian kernel, parameterized by $\gamma = \frac{1}{2\sigma^2}$.
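
As an illustration of the last point, a custom kernel is simply a callable that takes two data matrices and returns their Gram matrix. The sketch below defines a hypothetical my_kernel (a hand-rolled linear kernel, not used elsewhere in this notebook) and passes it straight to SVC:

In [ ]:
# A custom kernel is any callable K(A, B) returning the Gram matrix of
# shape (n_samples_A, n_samples_B); here, a hand-rolled linear kernel.
def my_kernel(A, B):
    return np.dot(A, B.T)

svc_custom = svm.SVC(kernel=my_kernel)
plot_estimator(svc_custom, X, y)
plt.title('Custom (hand-rolled linear) kernel')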

Linear Kernel

In [ ]:
svc_lin = svm.SVC(kernel='linear')
plot_estimator(svc_lin, X, y)
plt.scatter(svc_lin.support_vectors_[:, 0], svc_lin.support_vectors_[:, 1], 
            s=80, facecolors='none', linewidths=2, zorder=10)
plt.title('Linear kernel')

Polynomial Kernel

In [ ]:
# Note: kernel SVMs can be very slow to fit on unscaled features
# (Proline is roughly 1000x larger than Alcohol).
#svc_poly = svm.SVC(kernel='poly', degree=3)
#plot_estimator(svc_poly, X, y)
#plt.scatter(svc_poly.support_vectors_[:, 0], svc_poly.support_vectors_[:, 1], 
#           s=80, facecolors='none', linewidths=2, zorder=10)
#plt.title('Polynomial kernel')

RBF Kernel

In [ ]:
#svc_rbf = svm.SVC(kernel='rbf', gamma=1e-2)
#plot_estimator(svc_rbf, X, y)
#plt.scatter(svc_rbf.support_vectors_[:, 0], svc_rbf.support_vectors_[:, 1], 
#           s=80, facecolors='none', linewidths=2, zorder=10)
#plt.title('RBF kernel')

In [ ]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        wine.values, grape.values, test_size=0.4, random_state=0)

In [ ]:
f = svm.SVC(kernel='linear', C=1)
f.fit(X_train, y_train)
f.score(X_test, y_test)
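
SVMs are sensitive to feature scale, and Proline is orders of magnitude larger than the other attributes, so standardizing the features first is often worthwhile. A minimal sketch of one way to do this with the StandardScaler imported above:

In [ ]:
# Standardize features (zero mean, unit variance) using the training split only,
# then apply the same transformation to the test split.
scaler = StandardScaler().fit(X_train)
f_scaled = svm.SVC(kernel='linear', C=1)
f_scaled.fit(scaler.transform(X_train), y_train)
f_scaled.score(scaler.transform(X_test), y_test)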

In [ ]:
# mean accuracy per cross-validation fold
scores = cross_validation.cross_val_score(f, wine.values, grape.values, cv=5)
print(scores)

In [ ]:
# precision (note: with a multiclass target, newer scikit-learn versions
# require an explicit average, e.g. scoring='precision_weighted')
cross_validation.cross_val_score(f, wine.values, grape.values, cv=5, scoring='precision')
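
A single scalar hides the per-class behaviour; as a sketch, the held-out predictions of f from above can be summarized with a classification report and a confusion matrix:

In [ ]:
# Per-class precision, recall and F1, plus the confusion matrix, on the held-out split.
y_pred = f.predict(X_test)
print(metrics.classification_report(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))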

Analyzing the iris dataset


In [ ]:
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features. We could
                      # avoid this ugly slicing by using a two-dim dataset
y = iris.target

h = .02  # step size in the mesh

# we create an instance of SVM and fit our data. We do not scale our
# data since we want to plot the support vectors
C = 1.0  # SVM regularization parameter
svc = svm.SVC(kernel='linear', C=C).fit(X, y)
rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C).fit(X, y)
poly_svc = svm.SVC(kernel='poly', degree=3, C=C).fit(X, y)
lin_svc = svm.LinearSVC(C=C).fit(X, y)

# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# title for the plots
titles = ['SVC with linear kernel',
          'LinearSVC (linear kernel)',
          'SVC with RBF kernel', 'SVC with poly kernel']

fig = plt.figure(figsize=(10,10))

for i, clf in enumerate((svc, lin_svc, rbf_svc, poly_svc)):
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max] x [y_min, y_max].
    plt.subplot(2, 2, i + 1)
    plt.subplots_adjust(wspace=0.4, hspace=0.4)

    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)

    # Plot also the training points
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
    plt.xlabel('Sepal length')
    plt.ylabel('Sepal width')
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.xticks(())
    plt.yticks(())
    plt.title(titles[i])

plt.show()

Boosting


In [ ]:
adaboost = AdaBoostClassifier(svm.SVC(kernel='linear'),
                         algorithm="SAMME",
                         n_estimators=200)
#adaboost.fit(wine.values,grape.values)

In [ ]:
scores = cross_validation.cross_val_score(adaboost, wine.values, grape.values, cv=5)
print(scores)
print(scores.mean())
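
An SVM is an unusual (and expensive) weak learner for boosting. As a point of comparison, a sketch of AdaBoost with its default base estimator, depth-1 decision trees (decision stumps):

In [ ]:
# AdaBoost with the default base estimator (decision stumps).
adaboost_stumps = AdaBoostClassifier(n_estimators=200)
stump_scores = cross_validation.cross_val_score(adaboost_stumps, wine.values, grape.values, cv=5)
print(stump_scores)
print(stump_scores.mean())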

In [27]:
# Code for setting the style of the notebook
from IPython.core.display import HTML
def css_styling():
    styles = open("custom.css", "r").read()
    return HTML(styles)
css_styling()

