A simple classification problem: iris

Estimate the species of the iris plant from the size of its flowers (4 variables).


In [ ]:
# Import libraries

from itertools import product
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score
from IPython.display import Image 
import pydotplus

In [ ]:
# Load the data

iris = datasets.load_iris()

# Take a look at the data
print("Iris Data")
df = pd.DataFrame(iris.data)
df['target'] = iris.target
print("data = ", iris.feature_names)
print("target = ", iris.target_names)
df

In [ ]:
# Plot the data

plt.clf()
f, axarr = plt.subplots(2, 2, sharex='col', sharey='row', figsize=(11, 10))

axarr[0, 0].scatter(iris.data[:, 0], iris.data[:, 1], c=iris.target, alpha=0.6)
axarr[0, 0].set_title(iris.feature_names[0] + " vs " + iris.feature_names[1])

axarr[0, 1].scatter(iris.data[:, 1], iris.data[:, 2], c=iris.target, alpha=0.6)
axarr[0, 1].set_title(iris.feature_names[1] + " vs " + iris.feature_names[2])

axarr[1, 0].scatter(iris.data[:, 0], iris.data[:, 3], c=iris.target, alpha=0.6)
axarr[1, 0].set_title(iris.feature_names[0] + " vs " + iris.feature_names[3])

axarr[1, 1].scatter(iris.data[:, 2], iris.data[:, 3], c=iris.target, alpha=0.6)
axarr[1, 1].set_title(iris.feature_names[2] + " vs " + iris.feature_names[3])

plt.show()
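
As an aside, pandas can draw all of the pairwise scatter plots in a single call; a minimal sketch using pd.plotting.scatter_matrix (not part of the original flow):

In [ ]:
# All 6 pairwise feature scatter plots (plus per-feature histograms) in one call
pd.plotting.scatter_matrix(pd.DataFrame(iris.data, columns=iris.feature_names),
                           c=iris.target, figsize=(11, 10), alpha=0.6)
plt.show()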

Building a scikit-learn-compatible classifier

  • BaseEstimator
  • ClassifierMixin

By defining only the necessary methods, your class can be used just like any other scikit-learn classifier.


In [ ]:
from sklearn.base import BaseEstimator, ClassifierMixin

class IrisClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self):
        pass
        
    def fit(self, x, y):
        # Nothing to learn here: the rules are hand-written
        return self
    
    def predict_proba(self, x_list):
        return [self.predict_proba_sample(x) for x in x_list]

    def predict(self, x_list):
        proba = self.predict_proba(x_list)
        most_probable_category = np.argmax(proba, axis=1)
        return most_probable_category

    def predict_proba_sample(self, x):
        # e.g. x = [5.1, 3.5, 1.4, 0.2] should be classified as 0 ([1, 0, 0])

        raise NotImplementedError("FILL HERE")
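
For reference, one possible way to fill in the blank is a deliberately crude hand-written rule. The sketch below subclasses IrisClassifier so the exercise cell above stays blank; the sepal-length thresholds (5.5 and 6.5) are rough guesses read off the scatter plots, not tuned values.

In [ ]:
# One possible answer — a crude hand-written rule on sepal length (x[0]).
# The thresholds 5.5 and 6.5 are rough guesses, not tuned values.
class RuleBasedIrisClassifier(IrisClassifier):
    def predict_proba_sample(self, x):
        if x[0] < 5.5:
            return [1, 0, 0]   # probably setosa
        elif x[0] < 6.5:
            return [0, 1, 0]   # probably versicolor
        else:
            return [0, 0, 1]   # probably virginica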

Evaluating accuracy


In [ ]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

## Evaluation
### Use only two of the four features, for simplicity
X = iris.data[:, [0, 1]]
y = iris.target

clf = IrisClassifier()
clf.fit(X, y)

# Make predictions
predicted = clf.predict(X)

# Accuracy (ClassifierMixin's score() is mean accuracy)
print("Accuracy = ", accuracy_score(y, predicted))
print("Score    = ", clf.score(X, y))

Trying a few existing machine learning models

  • SVM
  • DecisionTree
  • KNN
  • Our own classifier (IrisClassifier)

In [ ]:
# Use only two of the four features, for simplicity
X = iris.data[:, [0, 1]]
y = iris.target

# Create a classifier instance: SVM
svm = SVC(kernel='rbf', probability=True)

# Train it
svm.fit(X, y)

# Accuracy on the training data
print("Score = {0}".format(svm.score(X, y)))

# Predict the first 10 samples
for x in X[:10]:
    print("Predict f(%s) = %s" % (x, svm.predict([x])))

In [ ]:
# Other classifiers
classifiers = [
    SVC(kernel='rbf', probability=True),
    DecisionTreeClassifier(max_depth=2),
    KNeighborsClassifier(n_neighbors=1),
    IrisClassifier(),
    ]

# They can all be used through the same interface
for classifier in classifiers:
    classifier.fit(X, y)

# Plotting decision regions
plt.clf()
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.05), np.arange(y_min, y_max, 0.05))

f, axarr = plt.subplots(2, 2, sharex='col', sharey='row', figsize=(11, 10))

for index, classifier, title in zip([0, 1, 2, 3], classifiers, ['Kernel SVM', 'Decision Tree (depth=2)', 'KNN (k=1)', 'IrisClassifier']):
    predicted = classifier.predict(np.c_[xx.ravel(), yy.ravel()])
    predicted = predicted.reshape(xx.shape)
    
    axarr[index // 2, index % 2].contourf(xx, yy, predicted, alpha=0.3)
    axarr[index // 2, index % 2].scatter(X[:, 0], X[:, 1], c=y, alpha=0.6)
    axarr[index // 2, index % 2].set_title("%s (%f)" % (title, classifier.score(X, y)))
    
plt.show()

Confusion Matrix

Counts of predicted labels versus the true labels.


In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import itertools

def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


from sklearn.metrics import confusion_matrix

matrix = confusion_matrix(y, clf.predict(X))
plot_confusion_matrix(matrix, iris.target_names, normalize=True)

In [ ]:
## Cross-validation
for classifier, title in zip(classifiers, ['Kernel SVM', 'Decision Tree (depth=2)', 'KNN (k=1)', 'IrisClassifier']):
    scores = cross_val_score(classifier, X, y, cv=6)
    print("Cross Validation Score of %s" % title)
    print("mean(%s) = %s" % (scores, scores.mean()))

Searching for the best parameters with grid search

ref. http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html


In [ ]:
## Grid search over the parameters
from sklearn.model_selection import GridSearchCV

search_params = [{
    'criterion': ['gini', 'entropy'],
    'max_depth': [1],
    # FILL HERE: add more parameters to the search
}]

tuned_clf = GridSearchCV(DecisionTreeClassifier(random_state=1), search_params, cv=6)
tuned_clf.fit(iris.data, iris.target)
print("Best Score %s " % tuned_clf.best_score_)
print("Best Params %s " % tuned_clf.best_params_)