In [1]:
# References
# pythonの機械学習ライブラリscikit-learnの紹介
# http://sucrose.hatenablog.com/entry/2013/05/25/133021
# 言語処理のための機械学習入門
# http://nlp.dse.ibaraki.ac.jp/~shinnou/zemi2011/ml4nlp/ml4nlp-rinka-0701.pdf
# Way of Experiment & Evaluation
# http://www.slideshare.net/lanevok/way-of-experiment-evaluation-16623634

In [2]:
import numpy  as np
import pandas as pd

In [3]:
from sklearn.datasets import load_iris

# Load the iris dataset: feature matrix X and integer class labels t.
iris = load_iris()
X, t = iris.data, iris.target

print(X)
print(t)


[[ 5.1  3.5  1.4  0.2]
 [ 4.9  3.   1.4  0.2]
 [ 4.7  3.2  1.3  0.2]
 [ 4.6  3.1  1.5  0.2]
 [ 5.   3.6  1.4  0.2]
 [ 5.4  3.9  1.7  0.4]
 [ 4.6  3.4  1.4  0.3]
 [ 5.   3.4  1.5  0.2]
 [ 4.4  2.9  1.4  0.2]
 [ 4.9  3.1  1.5  0.1]
 [ 5.4  3.7  1.5  0.2]
 [ 4.8  3.4  1.6  0.2]
 [ 4.8  3.   1.4  0.1]
 [ 4.3  3.   1.1  0.1]
 [ 5.8  4.   1.2  0.2]
 [ 5.7  4.4  1.5  0.4]
 [ 5.4  3.9  1.3  0.4]
 [ 5.1  3.5  1.4  0.3]
 [ 5.7  3.8  1.7  0.3]
 [ 5.1  3.8  1.5  0.3]
 [ 5.4  3.4  1.7  0.2]
 [ 5.1  3.7  1.5  0.4]
 [ 4.6  3.6  1.   0.2]
 [ 5.1  3.3  1.7  0.5]
 [ 4.8  3.4  1.9  0.2]
 [ 5.   3.   1.6  0.2]
 [ 5.   3.4  1.6  0.4]
 [ 5.2  3.5  1.5  0.2]
 [ 5.2  3.4  1.4  0.2]
 [ 4.7  3.2  1.6  0.2]
 [ 4.8  3.1  1.6  0.2]
 [ 5.4  3.4  1.5  0.4]
 [ 5.2  4.1  1.5  0.1]
 [ 5.5  4.2  1.4  0.2]
 [ 4.9  3.1  1.5  0.1]
 [ 5.   3.2  1.2  0.2]
 [ 5.5  3.5  1.3  0.2]
 [ 4.9  3.1  1.5  0.1]
 [ 4.4  3.   1.3  0.2]
 [ 5.1  3.4  1.5  0.2]
 [ 5.   3.5  1.3  0.3]
 [ 4.5  2.3  1.3  0.3]
 [ 4.4  3.2  1.3  0.2]
 [ 5.   3.5  1.6  0.6]
 [ 5.1  3.8  1.9  0.4]
 [ 4.8  3.   1.4  0.3]
 [ 5.1  3.8  1.6  0.2]
 [ 4.6  3.2  1.4  0.2]
 [ 5.3  3.7  1.5  0.2]
 [ 5.   3.3  1.4  0.2]
 [ 7.   3.2  4.7  1.4]
 [ 6.4  3.2  4.5  1.5]
 [ 6.9  3.1  4.9  1.5]
 [ 5.5  2.3  4.   1.3]
 [ 6.5  2.8  4.6  1.5]
 [ 5.7  2.8  4.5  1.3]
 [ 6.3  3.3  4.7  1.6]
 [ 4.9  2.4  3.3  1. ]
 [ 6.6  2.9  4.6  1.3]
 [ 5.2  2.7  3.9  1.4]
 [ 5.   2.   3.5  1. ]
 [ 5.9  3.   4.2  1.5]
 [ 6.   2.2  4.   1. ]
 [ 6.1  2.9  4.7  1.4]
 [ 5.6  2.9  3.6  1.3]
 [ 6.7  3.1  4.4  1.4]
 [ 5.6  3.   4.5  1.5]
 [ 5.8  2.7  4.1  1. ]
 [ 6.2  2.2  4.5  1.5]
 [ 5.6  2.5  3.9  1.1]
 [ 5.9  3.2  4.8  1.8]
 [ 6.1  2.8  4.   1.3]
 [ 6.3  2.5  4.9  1.5]
 [ 6.1  2.8  4.7  1.2]
 [ 6.4  2.9  4.3  1.3]
 [ 6.6  3.   4.4  1.4]
 [ 6.8  2.8  4.8  1.4]
 [ 6.7  3.   5.   1.7]
 [ 6.   2.9  4.5  1.5]
 [ 5.7  2.6  3.5  1. ]
 [ 5.5  2.4  3.8  1.1]
 [ 5.5  2.4  3.7  1. ]
 [ 5.8  2.7  3.9  1.2]
 [ 6.   2.7  5.1  1.6]
 [ 5.4  3.   4.5  1.5]
 [ 6.   3.4  4.5  1.6]
 [ 6.7  3.1  4.7  1.5]
 [ 6.3  2.3  4.4  1.3]
 [ 5.6  3.   4.1  1.3]
 [ 5.5  2.5  4.   1.3]
 [ 5.5  2.6  4.4  1.2]
 [ 6.1  3.   4.6  1.4]
 [ 5.8  2.6  4.   1.2]
 [ 5.   2.3  3.3  1. ]
 [ 5.6  2.7  4.2  1.3]
 [ 5.7  3.   4.2  1.2]
 [ 5.7  2.9  4.2  1.3]
 [ 6.2  2.9  4.3  1.3]
 [ 5.1  2.5  3.   1.1]
 [ 5.7  2.8  4.1  1.3]
 [ 6.3  3.3  6.   2.5]
 [ 5.8  2.7  5.1  1.9]
 [ 7.1  3.   5.9  2.1]
 [ 6.3  2.9  5.6  1.8]
 [ 6.5  3.   5.8  2.2]
 [ 7.6  3.   6.6  2.1]
 [ 4.9  2.5  4.5  1.7]
 [ 7.3  2.9  6.3  1.8]
 [ 6.7  2.5  5.8  1.8]
 [ 7.2  3.6  6.1  2.5]
 [ 6.5  3.2  5.1  2. ]
 [ 6.4  2.7  5.3  1.9]
 [ 6.8  3.   5.5  2.1]
 [ 5.7  2.5  5.   2. ]
 [ 5.8  2.8  5.1  2.4]
 [ 6.4  3.2  5.3  2.3]
 [ 6.5  3.   5.5  1.8]
 [ 7.7  3.8  6.7  2.2]
 [ 7.7  2.6  6.9  2.3]
 [ 6.   2.2  5.   1.5]
 [ 6.9  3.2  5.7  2.3]
 [ 5.6  2.8  4.9  2. ]
 [ 7.7  2.8  6.7  2. ]
 [ 6.3  2.7  4.9  1.8]
 [ 6.7  3.3  5.7  2.1]
 [ 7.2  3.2  6.   1.8]
 [ 6.2  2.8  4.8  1.8]
 [ 6.1  3.   4.9  1.8]
 [ 6.4  2.8  5.6  2.1]
 [ 7.2  3.   5.8  1.6]
 [ 7.4  2.8  6.1  1.9]
 [ 7.9  3.8  6.4  2. ]
 [ 6.4  2.8  5.6  2.2]
 [ 6.3  2.8  5.1  1.5]
 [ 6.1  2.6  5.6  1.4]
 [ 7.7  3.   6.1  2.3]
 [ 6.3  3.4  5.6  2.4]
 [ 6.4  3.1  5.5  1.8]
 [ 6.   3.   4.8  1.8]
 [ 6.9  3.1  5.4  2.1]
 [ 6.7  3.1  5.6  2.4]
 [ 6.9  3.1  5.1  2.3]
 [ 5.8  2.7  5.1  1.9]
 [ 6.8  3.2  5.9  2.3]
 [ 6.7  3.3  5.7  2.5]
 [ 6.7  3.   5.2  2.3]
 [ 6.3  2.5  5.   1.9]
 [ 6.5  3.   5.2  2. ]
 [ 6.2  3.4  5.4  2.3]
 [ 5.9  3.   5.1  1.8]]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]

In [4]:
# N: number of samples, D: feature dimension, K: number of classes
N, D = X.shape
K = len(np.unique(t))

print("N = {0}, D = {1}, K = {2}".format(N, D, K))


N = 150, D = 4, K = 3

In [5]:
# NOTE: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Split the data 80/20 into train/test; fixed seed for reproducibility.
X_train, X_test, t_train, t_test = train_test_split(X, t, test_size=0.2, random_state=0)

N_train = X_train.shape[0]
N_test  = X_test.shape[0]

print("N_train = {0}, N_test = {1}".format(N_train, N_test))


N_train = 120, N_test = 30

In [6]:
from sklearn.svm import SVC

# Build the classifier: an RBF-kernel SVM with probability estimates enabled.
clf = SVC(kernel='rbf', C=2.0, gamma=0.45, probability=True)
print(clf)


SVC(C=2.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.45, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [7]:
# Fit the SVM on the training split.
clf.fit(X_train, t_train)

# Support vectors.
# BUG FIX: clf.support_ holds indices into the TRAINING data, so they must
# index X_train, not the full X (equivalently, use clf.support_vectors_).
support         = clf.support_
support_vectors = X_train[support]
N_support       = support_vectors.shape[0]

print("N_support = {0}".format(N_support))


N_support = 35

In [8]:
# Two hand-crafted feature vectors to classify.
inputs = np.array([
    [1.1, 2.2, 3.3, 4.4],
    [4.1, 3.2, 2.3, 1.4]
])

# Predicted class labels and per-class probabilities.
outputs = clf.predict(inputs)
proba   = clf.predict_proba(inputs)

# Tabulate probabilities alongside the predicted label.
columns = ['proba_0', 'proba_1', 'proba_2', 'outputs']
frame = pd.DataFrame(
    {
        'proba_0': proba[:, 0],
        'proba_1': proba[:, 1],
        'proba_2': proba[:, 2],
        'outputs': outputs
    },
    columns=columns
)

print(frame)


    proba_0   proba_1   proba_2  outputs
0  0.215593  0.318436  0.465972        2
1  0.352462  0.370386  0.277152        1

In [9]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

y_true, y_pred = t_test, clf.predict(X_test)

# Accuracy
acc = accuracy_score(y_true, y_pred)
print("Accuracy: {0:.3f}".format(acc))

# Precision.
# FIX: pos_label only applies to binary targets; passing pos_label=None with
# average='macro'/'micro' is ignored and deprecated (an error in modern
# scikit-learn), so it is dropped.
pre_macro = precision_score(y_true, y_pred, average='macro')
pre_micro = precision_score(y_true, y_pred, average='micro')
print("Precision(macro): {0:.3f}".format(pre_macro))
print("Precision(micro): {0:.3f}".format(pre_micro))

# Recall
rec_macro = recall_score(y_true, y_pred, average='macro')
rec_micro = recall_score(y_true, y_pred, average='micro')
print("Recall(macro): {0:.3f}".format(rec_macro))
print("Recall(micro): {0:.3f}".format(rec_micro))

# F1 score
f1_macro = f1_score(y_true, y_pred, average='macro')
f1_micro = f1_score(y_true, y_pred, average='micro')
print("F-score(macro): {0:.3f}".format(f1_macro))
print("F-score(micro): {0:.3f}".format(f1_micro))


Accuracy: 1.000
Precision(macro): 1.000
Precision(micro): 1.000
Recall(macro): 1.000
Recall(micro): 1.000
F-score(macro): 1.000
F-score(micro): 1.000

In [10]:
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20;
# ShuffleSplit now lives in sklearn.model_selection, takes n_splits
# (not n=/n_iter=), and yields folds via cv.split(X).
from sklearn.model_selection import ShuffleSplit

# K repeated random train/test splits (shuffle-split cross-validation).
K      = 5
cv     = ShuffleSplit(n_splits=K, test_size=1 / K, random_state=0)
scores = []

print(cv)

for train, test in cv.split(X):
    # FIX: the loop body contained a bare `print` — a no-op expression in
    # Python 3 — which has been removed.
    X_train, X_test, t_train, t_test = X[train], X[test], t[train], t[test]
    clf = SVC(kernel='rbf', C=2.0, gamma=0.45, probability=True)
    clf.fit(X_train, t_train)
    scores.append(clf.score(X_test, t_test))

score_mean = np.mean(scores)
score_std  = np.std(scores)

print("Accuracy: {0:.3f} (+/- {1:.3f})".format(score_mean, 2 * score_std))


ShuffleSplit(150, n_iter=5, test_size=0.2, random_state=0)
Accuracy: 0.973 (+/- 0.027)

In [11]:
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20;
# cross_val_score now lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation (stratified by class for classifiers by default).
scores     = cross_val_score(clf, X, t, cv=5)
score_mean = np.mean(scores)
score_std  = np.std(scores)

print("Accuracy: {0:.3f} (+/- {1:.3f})".format(score_mean, 2 * score_std))


Accuracy: 0.973 (+/- 0.050)

In [12]:
# NOTE: sklearn.grid_search was removed in scikit-learn 0.20;
# GridSearchCV now lives in sklearn.model_selection.
from sklearn.model_selection import GridSearchCV

# Search grid over kernel type, regularization strength C, and RBF width gamma
# (gamma is ignored by the linear kernel).
grid_params = {
    'kernel': ['linear', 'rbf'],
    'C':      [1.0, 2.0, 10.0],
    'gamma':  [0.02, 0.45, 0.9]
}

# Hyperparameter tuning: exhaustive grid search, parallelized over all cores.
grid_search = GridSearchCV(clf, grid_params, cv=cv, n_jobs=-1)
grid_search.fit(X, t)
best_clf    = grid_search.best_estimator_
best_score  = grid_search.best_score_
best_params = grid_search.best_params_

print("Accuracy: {0:.3f}".format(best_score))
print("Parameters: {0}".format(best_params))


Accuracy: 0.993
Parameters: {'kernel': 'linear', 'gamma': 0.02, 'C': 1.0}

In [13]:
import pickle

# Serialize the best model to disk.
# FIX: the original passed a bare open(...) to pickle.dump, leaking the file
# handle; a `with` block guarantees it is flushed and closed.
with open('svm.dat', 'wb') as f:
    pickle.dump(best_clf, f)

In [14]:
# Deserialize the model from disk.
# FIX: use `with` so the file handle is closed (the original leaked it).
# SECURITY: pickle.load executes arbitrary code from the file — only load
# pickles you created yourself or fully trust.
with open('svm.dat', 'rb') as f:
    best_clf = pickle.load(f)