Classifier Tests

The goal is to verify that the sklearn SGDClassifier and the Vowpal Wabbit classifier behave as expected.

Simple binary problem.


In [89]:
from sklearn.datasets import make_classification
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

X, y = make_classification(n_samples=500, n_features=3, n_redundant=0, n_informative=2, 
                           n_clusters_per_class=1, hypercube=True, random_state=0)
            
# Make the dataset slightly unbalanced: keep all the negatives
# but only 70% of the positives.
num_pos = y.sum()
pos_subsample_factor = 0.7
ind = np.where(y == 0)[0].tolist() + np.where(y == 1)[0].tolist()[:int(num_pos * pos_subsample_factor)]
np.random.seed(0)
np.random.shuffle(ind)
X = X[ind]
y = y[ind]

df = pd.DataFrame(X)
pd.scatter_matrix(df, figsize=(9, 9), diagonal='kde', marker='o', s=40, alpha=.4, c=y)

from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X[:, 0], X[:, 1], X[:, 2], marker='o', c=y, s=40)


Out[89]:
<mpl_toolkits.mplot3d.art3d.Patch3DCollection at 0x117f5da50>

In [98]:
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import average_precision_score

param_grid = {
    'alpha': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9],
    'l1_ratio': [0.15, 0.3, 0.45, 0.6]
}
clf = SGDClassifier(loss='log', n_iter=10, penalty='elasticnet', class_weight='auto')
for x in [X[:, 0][:, np.newaxis], X[:, 1][:, np.newaxis], X[:, 2][:, np.newaxis], X[:, :2], X]:
    grid_clf = GridSearchCV(clf, param_grid).fit(x, y)
    y_pred = grid_clf.predict_proba(x)[:, 1]
    ap = average_precision_score(y, y_pred)
    print(ap)


0.450636875365
0.516611285973
0.906265230816
0.394680965228
0.92406190993
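
For reference, the AP of a chance-level classifier equals the positive-class prevalence, so the single-feature scores above should be read against that baseline. A quick added check (reusing the y from the first cell; the expected value is an estimate):

In [ ]:
# Chance-level AP baseline: the positive fraction after subsampling.
# With roughly 250 positives kept at 70%, expect about 175/425 ~ 0.41.
print(float((y == 1).sum()) / len(y))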

In [97]:
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import average_precision_score

param_grid = {
    'C': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9],
}
clf = SVC(kernel='linear', class_weight='auto', probability=True)
for x in [X[:, 0][:, np.newaxis], X[:, 1][:, np.newaxis], X[:, 2][:, np.newaxis], X[:, :2], X]:
    grid_clf = GridSearchCV(clf, param_grid).fit(x, y)
    y_pred = grid_clf.predict_proba(x)[:, 1]
    ap = average_precision_score(y, y_pred)
    print(ap)


0.391012233939
0.484194732752
0.906265230816
0.392002016435
0.77941989473
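
Both classifiers agree that most of the signal is in the third feature: on its own it reaches about 0.91 AP, while the first two features, alone or together, stay near chance.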

In [92]:
# Save the features and labels to test support directory
%load_ext autoreload
%autoreload 2
import vislab.util
import vislab.vw3
import vislab.tests.test_context

dirname = vislab.util.makedirs(
    vislab.tests.test_context.support_dirname + '/simple')
vislab.vw3.write_data_in_vw_format(df[[0, 1]], 'first', dirname + '/first.txt')
vislab.vw3.write_data_in_vw_format(df[[2]], 'second', dirname + '/second.txt.gz')

# VW expects binary labels in {-1, 1}, so remap the zeros.
y[y == 0] = -1
label_df = pd.DataFrame({'label': y, 'importance': np.ones_like(y)})
label_df.index = label_df.index.astype(str)
label_df.to_hdf(dirname + '/label_df.h5', 'df', mode='w')


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
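
Each row should come out as one line of VW's text format, roughly ['tag] |namespace index:value .... The exact output of vislab.vw3.write_data_in_vw_format is an assumption here; a hand-rolled sketch of the same idea:

In [ ]:
# Sketch of the assumed VW text format: one tagged, label-free line
# per row (labels are stored separately in label_df).
for idx, row in df[[0, 1]].head(3).iterrows():
    feats = ' '.join('%s:%s' % (i, v) for i, v in enumerate(row))
    print("'%s |first %s" % (idx, feats))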

Simple multiclass problem: Iris


In [119]:
import sklearn.datasets
import sklearn.cross_validation
iris = sklearn.datasets.load_iris()
X, Xt, y, yt = sklearn.cross_validation.train_test_split(iris.data, iris.target)

df = pd.DataFrame(X)
pd.scatter_matrix(df, figsize=(9, 9), diagonal='kde', marker='o', s=40, alpha=.4, c=y)


Out[119]:
4x4 array of matplotlib AxesSubplot objects (the scatter-matrix axes)

In [133]:
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import average_precision_score, accuracy_score

param_grid = {
    'alpha': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9],
    'l1_ratio': [0.15, 0.3, 0.45, 0.6]
}
clf = SGDClassifier(loss='log', n_iter=10, penalty='elasticnet', class_weight='auto')

for x, xt in [
            (X[:, 0][:, np.newaxis], Xt[:, 0][:, np.newaxis]),
            (X[:, 1][:, np.newaxis], Xt[:, 1][:, np.newaxis]),
            (X[:, 2][:, np.newaxis], Xt[:, 2][:, np.newaxis]),
            (X[:, :2], Xt[:, :2]),
            (X, Xt)
        ]:
    grid_clf = GridSearchCV(clf, param_grid).fit(x, y)
    yt_pred = grid_clf.predict_proba(xt)
    aps = [average_precision_score(yt == i, yt_pred[:, i]) for i in range(3)]
    aps.append(np.mean(aps))
    print('aps', aps)
    yt_pred_max = yt_pred.argmax(1)
    print('accuracy', accuracy_score(yt, yt_pred_max))
    print


('aps', [0.72368421052631582, 0.64473684210526316, 0.63157894736842102, 0.66666666666666663])
('accuracy', 0.28947368421052633)

('aps', [0.90134435567872107, 0.47689588676430783, 0.52129174875691897, 0.6331773303999827])
('accuracy', 0.52631578947368418)

('aps', [1.0, 0.99621212121212133, 0.99545454545454537, 0.99722222222222223])
('accuracy', 0.81578947368421051)

('aps', [1.0, 0.48127458040911453, 0.65506201723113489, 0.7121121992134164])
('accuracy', 0.73684210526315785)

('aps', [1.0, 0.89830768142927853, 0.73769119769119762, 0.87866629304015875])
('accuracy', 0.94736842105263153)
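
As in the binary case, one feature dominates: the third column (petal length) alone gives near-perfect per-class AP, while using all four features yields the best accuracy.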


In [127]:
# Save the features and labels to test support directory
%load_ext autoreload
%autoreload 2
import vislab.util
import vislab.vw3
import vislab.dataset
import vislab.tests.test_context

dirname = vislab.util.makedirs(
    vislab.tests.test_context.support_dirname + '/iris')
vislab.vw3.write_data_in_vw_format(df, 'all', dirname + '/all.txt.gz')

label_df = pd.DataFrame({'label': [str(_) for _ in y]})
label_df.index = label_df.index.astype(str)
label_df = vislab.dataset.get_bool_df(label_df, 'label')
label_df.to_hdf(dirname + '/label_df.h5', 'df', mode='w')


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
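
get_bool_df presumably expands the string labels into one boolean indicator column per class before saving; a rough pandas equivalent of that call (an assumption about vislab.dataset.get_bool_df's behavior):

In [ ]:
# Assumed behavior of get_bool_df: one boolean column per label value.
labels = pd.Series([str(_) for _ in y])
print(pd.get_dummies(labels).astype(bool).head())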

Simple multiclass problem: Iris with only the first two features.


In [ ]:
import sklearn.metrics

vw = VW('temp_vw')
vw.fit(X, Y)
y_pred = vw.predict(X)
print y_pred
print sklearn.metrics.accuracy_score(Y, y_pred)
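
The VW class used above is an sklearn-style wrapper around the vw command-line tool; it is never defined in this notebook, which is why the cell errors out below. A minimal sketch of what such a wrapper could look like (the class interface, file naming, and label encoding are assumptions; only the vw flags are standard). The quadratic='::' argument seen below would presumably map onto VW's -q flag for quadratic feature interactions.

In [ ]:
import subprocess
import numpy as np

class VW(object):
    """Minimal sketch of an sklearn-style one-against-all VW wrapper."""
    def __init__(self, name, loss='hinge'):
        self.name = name
        self.loss = loss

    def _write(self, X, y, filename):
        # Standard VW text format: label |namespace index:value ...
        # --oaa expects labels in 1..k, hence the +1 on 0-indexed labels.
        with open(filename, 'w') as f:
            for features, label in zip(X, y):
                feats = ' '.join('%d:%f' % (i, v) for i, v in enumerate(features))
                f.write('%d |f %s\n' % (int(label) + 1, feats))

    def fit(self, X, y):
        self._write(X, y, self.name + '.train')
        subprocess.check_call([
            'vw', '-d', self.name + '.train', '-f', self.name + '.model',
            '--oaa', str(len(np.unique(y))), '--loss_function', self.loss])
        return self

    def predict(self, X):
        # Dummy labels; -t runs VW in test-only mode.
        self._write(X, np.zeros(len(X)), self.name + '.test')
        subprocess.check_call([
            'vw', '-t', '-i', self.name + '.model',
            '-d', self.name + '.test', '-p', self.name + '.preds'])
        return np.loadtxt(self.name + '.preds').astype(int) - 1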

In [118]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets

# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features. We could
                      # avoid this ugly slicing by using a two-dim dataset
Y = iris.target

h = .02  # step size in the mesh

# We create an instance of SVM and fit our data. We do not scale the
# data, since we want to plot the support vectors.
C = 1.0  # SVM regularization parameter

classifiers = {
    'LinearSVC (linear kernel)': svm.LinearSVC(C=C).fit(X, Y),
    'SVC with 2-poly kernel': svm.SVC(kernel='poly', degree=2, C=C).fit(X, Y),
    'VW with hinge loss': VW('temp_vw').fit(X, Y),
    'VW with hinge loss and 2-poly expansion': VW('temp_vw_q').fit(X, Y, quadratic='::')
}

# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

fig = plt.figure(figsize=(12,12))
i = 0
for title, clf in classifiers.iteritems():
    i += 1
    
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max] x [y_min, y_max].
    ax = fig.add_subplot(2, 2, i)
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    ax.contourf(xx, yy, Z, cmap=plt.cm.Paired)
    ax.axis('off')

    # Plot also the training points
    ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired, edgecolor='k')

    ax.set_title(title)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-118-57c70c283ebc> in <module>()
     18     'LinearSVC (linear kernel)': svm.LinearSVC(C=C).fit(X, Y),
     19     'SVC with 2-poly kernel': svm.SVC(kernel='poly', degree=2, C=C).fit(X, Y),
---> 20     'VW with hinge loss': VW('temp_vw').fit(X, Y),
     21     'VW with hinge loss and 2-poly expansion': VW('temp_vw_q').fit(X, Y, quadratic='::')
     22 }

NameError: name 'VW' is not defined
