In [1]:
%matplotlib inline
import numpy as np
# class '0'
features_0 = [-1.5, 0] + np.random.randn(100, 2)
labels_0 = np.zeros(100)
# class '1'
features_1 = [+1.5, 0] + np.random.randn(100, 2)
labels_1 = np.ones(100)
# show the training set with matplotlib
import matplotlib.pyplot as plt
plt.figure()
plt.plot(features_0[:, 0], features_0[:, 1], 'r+') # r+ means red pluses
plt.plot(features_1[:, 0], features_1[:, 1], 'bo') # bo means blue circles
Out[1]:
In the following cell, we define everything needed to train the classifiers and display their decision regions.
In [2]:
# Merge the two classes in a single set
features = np.concatenate((features_0, features_1)) # for features
labels = np.concatenate((labels_0, labels_1)) # for labels
# Define a mesh grid on which we will test the classifiers
mesh_size = 0.1
x_min, x_max = features[:, 0].min() - 1, features[:, 0].max() + 1
y_min, y_max = features[:, 1].min() - 1, features[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, mesh_size),
                     np.arange(y_min, y_max, mesh_size))
# Define a function that shows the decision regions of a trained classifier
def show_results(classifier, title):
    Z = classifier.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
    plt.scatter(features[:, 0], features[:, 1], c=labels, cmap=plt.cm.Paired)
    # c=labels means that the color will correspond to the label
    # cmap=plt.cm.Paired is a colormap less aggressive than the default blue/red colors (which are somewhat flashy)
    plt.title(title)
In [3]:
# in scikit-learn, the SVM classifier is referred to as SVC (Support Vector Classifier)
from sklearn.svm import SVC
In [4]:
my_C = 10 # the soft-margin trade-off parameter
my_kernel = 'linear' # the kernel
my_linear_classifier = SVC(kernel = my_kernel, C = my_C).fit(features, labels) # train the classifier
show_results(my_linear_classifier, my_kernel) # display the results
In [5]:
my_kernel = 'rbf'
my_gamma = 0.5
my_gaussian_classifier = SVC(kernel = my_kernel, C = my_C, gamma = my_gamma).fit(features, labels)
show_results(my_gaussian_classifier, my_kernel)
In [6]:
my_kernel = 'poly'
my_degree = 5
my_polynomial_classifier = SVC(kernel = my_kernel, C = my_C, degree = my_degree).fit(features, labels)
show_results(my_polynomial_classifier, my_kernel)
In [7]:
from sklearn.tree import DecisionTreeClassifier
my_tree_classifier = DecisionTreeClassifier().fit(features, labels)
show_results(my_tree_classifier, 'tree')
There are several ways to evaluate a model without having to visualize the data. The accuracy is the simplest of them: it is the proportion of correctly classified samples. Standard evaluation metrics can be found in sklearn.metrics.
Note: In this lab session, we won't go into much detail about model evaluation. A later session will be entirely dedicated to model evaluation and selection.
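To make the definition concrete, here is a minimal sketch (not part of the original lab) computing the accuracy "by hand" on the linear classifier trained above; it should match the value returned by metrics.accuracy_score in the next cell.
# Minimal sketch: accuracy is just the fraction of samples whose predicted label equals the true label.
predicted = my_linear_classifier.predict(features)
print "accuracy computed by hand: ", np.mean(predicted == labels)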
In [8]:
from sklearn import metrics
predicted_labels = my_linear_classifier.predict(features)
print "accuracy of the linear classifier: ", metrics.accuracy_score(labels, predicted_labels)
predicted_labels = my_gaussian_classifier.predict(features)
print "accuracy of the rbf classifier: ", metrics.accuracy_score(labels, predicted_labels)
predicted_labels = my_polynomial_classifier.predict(features)
print "accuracy of the polynomial classifier: ", metrics.accuracy_score(labels, predicted_labels)
In [9]:
# Generate the data
features_0 = [-2.0, 0] + np.random.randn(100, 2)
features_1 = [+2.0, 0] + np.random.randn(100, 2)
features_2 = [0, +2.0] + np.random.randn(100, 2)
labels_0 = np.zeros(100)
labels_1 = np.ones(100)
labels_2 = 2 * np.ones(100)
# Merge the data
features = np.concatenate((features_0, features_1, features_2))
labels = np.concatenate((labels_0, labels_1, labels_2))
# Re-define the meshgrid
mesh_size = 0.1
x_min, x_max = features[:, 0].min() - 1, features[:, 0].max() + 1
y_min, y_max = features[:, 1].min() - 1, features[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, mesh_size),
                     np.arange(y_min, y_max, mesh_size))
There are two main ways to extend a binary classifier (such as the SVM) to the multiclass case: "one versus one" and "one versus all" (also called "one versus rest").
The SVC classifier exposes both through the decision_function_shape parameter (either 'ovo' or 'ovr'); if it is not set, 'ovr' is used by default. (Internally, SVC always trains its binary SVMs in a one-versus-one fashion; this parameter only affects the shape of the values returned by decision_function.)
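As an aside (not part of the original lab), scikit-learn also exposes these two strategies as explicit meta-estimators in sklearn.multiclass. The following is a minimal sketch wrapping a linear SVC with each of them on the 3-class data generated above; the variable names ovo_classifier and ovr_classifier are ours.
# Aside (minimal sketch): the two multiclass strategies written out explicitly
# with the sklearn.multiclass wrappers, on the 3-class data above.
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
ovo_classifier = OneVsOneClassifier(SVC(kernel='linear', C=1.0)).fit(features, labels)  # one binary SVM per pair of classes
ovr_classifier = OneVsRestClassifier(SVC(kernel='linear', C=1.0)).fit(features, labels)  # one binary SVM per class (that class vs. the rest)
plt.figure()
show_results(ovo_classifier, 'one versus one')
plt.figure()
show_results(ovr_classifier, 'one versus rest')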
In [10]:
# feel free to play with the parameters (kernel, C, gamma, degree).
my_C = 1.0
my_kernel = 'linear'
my_linear_classifier = SVC(kernel = my_kernel, C = my_C, decision_function_shape = 'ovo').fit(features, labels)
show_results(my_linear_classifier, my_kernel)
In [11]:
my_linear_classifier = SVC(kernel = my_kernel, C = my_C, decision_function_shape = 'ovr').fit(features, labels)
show_results(my_linear_classifier, my_kernel)
In [12]:
my_kernel = 'rbf'
my_gamma = 0.5
my_gaussian_classifier = SVC(kernel = my_kernel, C = my_C, gamma = my_gamma).fit(features, labels)
show_results(my_gaussian_classifier, my_kernel)
In [13]:
my_kernel = 'poly'
my_degree = 5
my_polynomial_classifier = SVC(kernel = my_kernel, C = my_C, degree = my_degree).fit(features, labels)
show_results(my_polynomial_classifier, my_kernel)
In [14]:
from sklearn.tree import DecisionTreeClassifier
my_tree_classifier = DecisionTreeClassifier().fit(features, labels)
show_results(my_tree_classifier, 'tree')
In [16]:
from sklearn.tree import export_graphviz
export_graphviz(my_tree_classifier, out_file='tree.dot')
# The following is OS dependent
# Ubuntu (you'll need to install GraphViz: 'apt-get install graphviz')
# PS/PDF format:
!dot -Tps tree.dot -o tree.ps
!evince tree.ps
# MacOS: You'll also need GraphViz: 'brew install graphviz'
# !dot -Tpng tree.dot -o tree.png
# !open tree.png # works with MacOS
# Note: The "!" at the beginning of a line in a code cell means that what follows will be run as if you were in a terminal.
# (Hence, these commands depend on your OS and on what is installed on it.)
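As an OS-independent aside (not used in the rest of this lab), recent scikit-learn versions (0.21 and later, which may be newer than the one used here) can draw the fitted tree directly with matplotlib:
# Aside: requires scikit-learn >= 0.21; draws the tree with matplotlib, no GraphViz needed.
from sklearn.tree import plot_tree
plt.figure(figsize=(12, 6))
plot_tree(my_tree_classifier, filled=True)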
In this section, we will apply the classification algorithms we have seen to a standard dataset: the Iris dataset (https://en.wikipedia.org/wiki/Iris_flower_data_set).
It is a very small dataset that can be loaded as follows:
In [17]:
from sklearn import datasets
iris = datasets.load_iris()
The iris object has several fields:
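Since iris is a dictionary-like Bunch object, a quick way to list its fields (an aside, not part of the original lab) is:
# Aside: list the available fields; expect entries such as 'data', 'target', 'target_names', 'feature_names' and 'DESCR'.
print iris.keys()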
In [18]:
print iris
We can show the dataset description:
In [19]:
print iris.DESCR
The fields that interest us the most are those concerning the features (data and feature_names) and the labels (target and target_names).
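For instance (a small aside), each entry of target is an index into target_names, so the species name of every sample can be recovered directly:
# Aside: iris.data is a (150, 4) array and iris.target_names[iris.target] maps numeric labels back to species names.
print iris.data.shape
print iris.target_names[iris.target][:5]  # species names of the first 5 samples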
In [20]:
features = iris.data
print features
In [21]:
features_names = iris.feature_names
print features_names
In [22]:
labels = iris.target
print labels
In [23]:
label_names = iris.target_names
print label_names
In [24]:
plt.figure()
plt.scatter(features[:, 0], features[:, 1], c=labels)
Out[24]:
In [25]:
plt.figure()
plt.scatter(features[:, 2], features[:, 3], c=labels)
Out[25]:
In [26]:
C = 1.0
my_linear_classifier = SVC(kernel='linear', C=C).fit(features, labels)
In [27]:
predicted_labels = my_linear_classifier.predict(features)
print "accuracy of the linear classifier: ", metrics.accuracy_score(labels, predicted_labels)
In [28]:
from sklearn.cross_validation import train_test_split
# or, depending on your sklearn version
# from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.5)
In [29]:
my_linear_classifier = SVC(kernel='linear', C=C).fit(features_train, labels_train)
In [30]:
predicted_labels_train = my_linear_classifier.predict(features_train)
print "accuracy of the linear classifier (train): ", metrics.accuracy_score(labels_train, predicted_labels_train)
predicted_labels_test = my_linear_classifier.predict(features_test)
print "accuracy of the linear classifier (test): ", metrics.accuracy_score(labels_test, predicted_labels_test)
In [31]:
from sklearn.datasets import load_digits
digits = load_digits()
In [32]:
print digits.DESCR
In [33]:
print digits.target_names
In [34]:
features = digits.data
labels = digits.target
images = digits.images
In [35]:
print images
In [36]:
print images[0]
In [37]:
features[0, :]
Out[37]:
In [38]:
features.shape
Out[38]:
In [39]:
labels.shape
Out[39]:
In [40]:
for i in range(4):
    plt.subplot(1, 4, i + 1)
    plt.axis('off')
    plt.imshow(images[i], cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Label: %i' % labels[i])
In [41]:
C = 0.01
my_linear_classifier = SVC(kernel='linear', C=C).fit(features, labels)
In [42]:
predicted_labels = my_linear_classifier.predict(features)
print "accuracy of the linear classifier: ", metrics.accuracy_score(labels, predicted_labels)
In [43]:
print my_linear_classifier
In [44]:
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.5)
my_rbf_classifier = SVC(kernel='rbf', C=C).fit(features_train, labels_train)
predicted_labels_train = my_rbf_classifier.predict(features_train)
print "accuracy of the rbf classifier (train): ", metrics.accuracy_score(labels_train, predicted_labels_train)
predicted_labels_test = my_rbf_classifier.predict(features_test)
print "accuracy of the rbf classifier (test): ", metrics.accuracy_score(labels_test, predicted_labels_test)