In [1]:
from __future__ import print_function, division
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# use seaborn for better matplotlib styles
import seaborn; seaborn.set(style='white')
In [2]:
from sklearn.datasets import fetch_lfw_people
faces = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
A quick warm-up with this dataset:

- Use plt.imshow to plot several of the images. How many pixels are in each image?
- Use sklearn.cross_validation.train_test_split to split the data into a training set and a test set.
In [3]:
faces.keys()
Out[3]:
In [4]:
n_samples, n_features = faces.data.shape
print(n_samples, n_features)
In [5]:
print(faces.target_names)
In [6]:
fig, axes = plt.subplots(4, 8, figsize=(12, 9))
for i, ax in enumerate(axes.flat):
    ax.imshow(faces.images[i], cmap='binary_r')
    ax.set_title(faces.target_names[faces.target[i]], fontsize=10)
    ax.set_xticks([]); ax.set_yticks([])
Let's use some dimensionality reduction routines to try to understand the data. Just a warning: you'll probably find that, unlike in the case of the handwritten digits, the projections are a bit too jumbled to give much insight. Still, it's always a useful step in understanding your data!
In [7]:
X = faces.data
y = faces.target
In [8]:
from sklearn.decomposition import PCA
from sklearn.manifold import Isomap
X_pca = PCA(n_components=2).fit_transform(X)
X_iso = Isomap(n_components=2).fit_transform(X)
In [9]:
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=faces.target,
            cmap='Blues')
plt.title('PCA projection');
In [10]:
plt.scatter(X_iso[:, 0], X_iso[:, 1], c=faces.target,
            cmap='Blues')
plt.title('Isomap projection');
It's not obvious from these projections that the data can be well separated; on the other hand, we've reduced our 1850-dimensional data down to just two dimensions!
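As an optional side check (not part of the original exercise), you can ask how much of the total variance those two PCA components actually capture by looking at the fitted model's explained_variance_ratio_ attribute:

# Sketch: fraction of total variance captured by the two PCA components
pca = PCA(n_components=2).fit(X)
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.sum())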
Here we'll perform a classification task on our data: given a training set, we want to build a classifier that accurately predicts the labels of the test set.

- Split the data into a training set and a test set (sklearn.cross_validation.train_test_split).
- Use a support vector classifier (sklearn.svm.SVC) to classify the data. Import this and instantiate the estimator.
- Fit the model and use sklearn.metrics.accuracy_score to see how well you're doing.
- Experiment with the C parameter of SVC. Look at the SVC doc string and try some choices for the kernel, for C, and for gamma. What's the best accuracy you can find?
- Finally, use sklearn.metrics.classification_report and sklearn.metrics.confusion_matrix, and plot some of the images with the true and predicted labels. How well does it do?
In [11]:
# split the data
# (in newer scikit-learn releases, train_test_split lives in sklearn.model_selection)
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
print(X_train.shape, X_test.shape)
In [12]:
# instantiate the estimator
from sklearn.svm import SVC
clf = SVC()
In [13]:
# Do a fit and check accuracy
from sklearn.metrics import accuracy_score
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)
Out[13]:
In [14]:
# Note that we can also do this:
clf.score(X_test, y_test)
Out[14]:
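Note that this accuracy comes from a single train/test split, so it will shift a bit with random_state. If you want a more stable estimate, one optional approach is cross-validation; the sketch below uses cross_val_score, imported from sklearn.cross_validation to match the version used above (in newer releases it lives in sklearn.model_selection):

# Sketch: 5-fold cross-validated accuracy for the default SVC,
# just a sanity check on the single-split score above
from sklearn.cross_validation import cross_val_score
scores = cross_val_score(SVC(), X, y, cv=5)
print(scores.mean(), scores.std())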
In [15]:
# Try out various hyper parameters
for kernel in ['linear', 'rbf', 'poly']:
    clf = SVC(kernel=kernel).fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print("{0}: accuracy = {1}".format(kernel, score))
It looks like the linear kernel gives the best results.
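The exercise also asks about C and gamma. Rather than trying values by hand, one option is a small grid search; the sketch below assumes GridSearchCV from sklearn.grid_search (sklearn.model_selection in newer releases), and the parameter values are only illustrative guesses, not tuned recommendations:

# Sketch: grid search over C and gamma for an RBF kernel
from sklearn.grid_search import GridSearchCV
param_grid = {'C': [0.1, 1, 10, 100],
              'gamma': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(SVC(kernel='rbf'), param_grid)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.score(X_test, y_test))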
In [16]:
best_clf = SVC(kernel='linear').fit(X_train, y_train)
y_pred = best_clf.predict(X_test)
In [17]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=faces.target_names))
In [18]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)
Out[18]:
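With this many classes the raw matrix takes some squinting; an optional readability trick is to display it as an image with the class names along the axes:

# Sketch: show the confusion matrix as a heatmap, purely for readability
cm = confusion_matrix(y_test, y_pred)
plt.imshow(cm, interpolation='nearest', cmap='Blues')
plt.colorbar()
plt.xticks(range(len(faces.target_names)), faces.target_names, rotation=90)
plt.yticks(range(len(faces.target_names)), faces.target_names)
plt.xlabel('predicted label')
plt.ylabel('true label');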
In [19]:
shape = faces.images.shape[-2:]
last_names = [label.split()[-1] for label in faces.target_names]
titles = ["True: {0}\nPred: {1}".format(last_names[i_test],
last_names[i_pred])
for (i_test, i_pred) in zip(y_test, y_pred)]
fig, axes = plt.subplots(4, 8, figsize=(12, 9),
subplot_kw=dict(xticks=[], yticks=[]))
for i, ax in enumerate(axes.flat):
ax.imshow(X_test[i].reshape(shape), cmap='binary_r')
ax.set_title(titles[i], fontsize=10)
It still amazes me that with such a simple algorithm, we can get ~80% prediction accuracy on data like this!