In [1]:
# References
# Introduction to scikit-learn, a machine learning library for Python
# http://sucrose.hatenablog.com/entry/2013/05/25/133021
# Introduction to Machine Learning for Natural Language Processing
# http://nlp.dse.ibaraki.ac.jp/~shinnou/zemi2011/ml4nlp/ml4nlp-rinka-0701.pdf
# Way of Experiment & Evaluation
# http://www.slideshare.net/lanevok/way-of-experiment-evaluation-16623634
In [2]:
import numpy as np
import pandas as pd
In [3]:
from sklearn.datasets import load_iris
# Load the data
iris = load_iris()
X = iris.data
t = iris.target
print(X)
print(t)
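The Bunch object returned by load_iris also carries human-readable metadata; a quick sketch for inspecting it:
# Feature and class names provided by the Bunch object
print(iris.feature_names)
print(iris.target_names)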
In [4]:
# N: number of samples, D: number of features, K: number of classes
N, D = X.shape
K = np.unique(t).size
print("N = {0}, D = {1}, K = {2}".format(N, D, K))
In [5]:
from sklearn.model_selection import train_test_split
# Split the data into training and test sets
X_train, X_test, t_train, t_test = train_test_split(X, t, test_size=0.2, random_state=0)
N_train = X_train.shape[0]
N_test = X_test.shape[0]
print("N_train = {0}, N_test = {1}".format(N_train, N_test))
In [6]:
from sklearn.svm import SVC
# Create the classifier (SVM)
clf = SVC(kernel='rbf', C=2.0, gamma=0.45, probability=True)
print(clf)
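For reference, the RBF kernel that gamma parameterizes is k(x, x') = exp(-gamma * ||x - x'||^2); a minimal numpy sketch for two training samples:
# Kernel value between the first two training samples (gamma = 0.45 as above)
x, x_prime = X_train[0], X_train[1]
k = np.exp(-0.45 * np.sum((x - x_prime) ** 2))
print("k(x, x') = {0:.6f}".format(k))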
In [7]:
# Train
clf.fit(X_train, t_train)
# Support vectors (support_ holds indices into the training set, so index X_train)
support = clf.support_
support_vectors = X_train[support]
N_support = support_vectors.shape[0]
print("N_support = {0}".format(N_support))
In [8]:
# Input data
inputs = np.array([
    [1.1, 2.2, 3.3, 4.4],
    [4.1, 3.2, 2.3, 1.4]
])
# Predict
outputs = clf.predict(inputs)
proba = clf.predict_proba(inputs)
data = {
    'proba_0': proba[:, 0],
    'proba_1': proba[:, 1],
    'proba_2': proba[:, 2],
    'outputs': outputs
}
columns = ['proba_0', 'proba_1', 'proba_2', 'outputs']
frame = pd.DataFrame(data, columns=columns)
print(frame)
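Note that SVC's predict_proba comes from Platt scaling fit by internal cross-validation, so its argmax can occasionally disagree with predict; a quick consistency check (here the class labels 0..2 match the column indices):
# Compare the probability argmax with the hard predictions
print(np.argmax(proba, axis=1) == outputs)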
In [9]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
y_true, y_pred = t_test, clf.predict(X_test)
# Accuracy
acc = accuracy_score(y_true, y_pred)
print("Accuracy: {0:.3f}".format(acc))
# Precision
pre_macro = precision_score(y_true, y_pred, average='macro')
pre_micro = precision_score(y_true, y_pred, average='micro')
print("Precision(macro): {0:.3f}".format(pre_macro))
print("Precision(micro): {0:.3f}".format(pre_micro))
# Recall
rec_macro = recall_score(y_true, y_pred, average='macro')
rec_micro = recall_score(y_true, y_pred, average='micro')
print("Recall(macro): {0:.3f}".format(rec_macro))
print("Recall(micro): {0:.3f}".format(rec_micro))
# F-score
f1_macro = f1_score(y_true, y_pred, average='macro')
f1_micro = f1_score(y_true, y_pred, average='micro')
print("F-score(macro): {0:.3f}".format(f1_macro))
print("F-score(micro): {0:.3f}".format(f1_micro))
In [10]:
from sklearn.model_selection import ShuffleSplit
# Cross-validation with K random shuffle splits (unlike strict K-fold,
# the test sets may overlap across iterations)
K = 5
cv = ShuffleSplit(n_splits=K, test_size=1 / K, random_state=0)
scores = []
for train, test in cv.split(X):
    X_train, X_test, t_train, t_test = X[train], X[test], t[train], t[test]
    clf = SVC(kernel='rbf', C=2.0, gamma=0.45, probability=True)
    clf.fit(X_train, t_train)
    score = clf.score(X_test, t_test)
    scores.append(score)
score_mean = np.mean(scores)
score_std = np.std(scores)
print("Accuracy: {0:.3f} (+/- {1:.3f})".format(score_mean, 2 * score_std))
In [11]:
from sklearn.model_selection import cross_val_score
# K-fold cross-validation
scores = cross_val_score(clf, X, t, cv=5)
score_mean = np.mean(scores)
score_std = np.std(scores)
print("Accuracy: {0:.3f} (+/- {1:.3f})".format(score_mean, 2 * score_std))
In [12]:
from sklearn.model_selection import GridSearchCV
grid_params = {
    'kernel': ['linear', 'rbf'],
    'C': [1.0, 2.0, 10.0],
    'gamma': [0.02, 0.45, 0.9]
}
# Hyperparameter tuning
grid_search = GridSearchCV(clf, grid_params, cv=cv, n_jobs=-1)
grid_search.fit(X, t)
best_clf = grid_search.best_estimator_
best_score = grid_search.best_score_
best_params = grid_search.best_params_
print("Accuracy: {0:.3f}".format(best_score))
print("Parameters: {0}".format(best_params))
In [13]:
import pickle
# Serialize (write)
with open('svm.dat', 'wb') as f:
    pickle.dump(best_clf, f)
In [14]:
# Deserialize (read)
with open('svm.dat', 'rb') as f:
    best_clf = pickle.load(f)
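For scikit-learn models, which mostly hold large numpy arrays, the joblib package is a common alternative to pickle; a sketch assuming joblib is installed (the filename 'svm.joblib' is just an example):
import joblib
# joblib serializes numpy arrays more efficiently than plain pickle
joblib.dump(best_clf, 'svm.joblib')
best_clf = joblib.load('svm.joblib')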