In [1]:
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version
In [2]:
# Compute the ensemble misclassification rate
from scipy.special import comb  # scipy.misc.comb was removed in SciPy 1.0
import math
def ensemble_error(n_classifier, error):
    # Smallest number of classifiers that constitutes a majority
    k_start = int(math.ceil(n_classifier / 2.0))
    # Binomial probability that exactly k of the n classifiers err,
    # summed over every k that forms a majority
    probs = []
    for k in range(k_start, n_classifier + 1):
        probs.append(comb(n_classifier, k) * error ** k * (1 - error) ** (n_classifier - k))
    return sum(probs)
ensemble_error(n_classifier=11, error=0.25)
Out[2]:
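The cell above evaluates the binomial formula for the error of a majority vote over $n$ independent base classifiers that each err with probability $\varepsilon$:

$$\varepsilon_{\mathrm{ens}} = \sum_{k=\lceil n/2 \rceil}^{n} \binom{n}{k}\, \varepsilon^{k} (1 - \varepsilon)^{n-k}$$

For $n = 11$ and $\varepsilon = 0.25$ this gives roughly 0.034, and as the plot in the next cell shows, the ensemble error stays below the base error only while $\varepsilon < 0.5$.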
In [3]:
import numpy as np
error_range = np.arange(0.0, 1.0, 0.01)
ens_errors = [ensemble_error(n_classifier=11, error=error) for error in error_range]
import matplotlib.pyplot as plt
plt.plot(error_range, ens_errors, label='Ensemble error', linewidth=2)
plt.plot(error_range, error_range, label='Base error', linewidth=2, linestyle='--')
plt.xlabel('Base error')
plt.ylabel('Base/Ensemble error')
plt.legend(loc='upper left')
plt.grid()
plt.show()
In [4]:
import numpy as np
# Class labels: 0, 0, 1
# Weights: 0.2, 0.2, 0.6
np.argmax(np.bincount([0, 0, 1], weights=[0.2, 0.2, 0.6]))
Out[4]:
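np.bincount sums the weight attached to each vote per class label, so the weighted counts here are [0.2 + 0.2, 0.6] = [0.4, 0.6] and argmax returns class 1. A minimal sketch of the equivalent explicit computation:

labels = np.array([0, 0, 1])
weights = np.array([0.2, 0.2, 0.6])
# Sum the weights of the votes cast for each class
weighted_counts = np.array([weights[labels == c].sum() for c in (0, 1)])
print(weighted_counts)             # [0.4 0.6]
print(np.argmax(weighted_counts))  # 1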
In [5]:
ex = np.array([[0.9, 0.1],
[0.8, 0.2],
[0.4, 0.6]])
# Class-membership probabilities, averaged with the classifier weights
p = np.average(ex, axis=0, weights=[0.2, 0.2, 0.6])
print(p)
# Result of the (soft) majority vote
print(np.argmax(p))
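Written out, the weighted average of the three probability vectors is

$$p = 0.2 \begin{bmatrix} 0.9 \\ 0.1 \end{bmatrix} + 0.2 \begin{bmatrix} 0.8 \\ 0.2 \end{bmatrix} + 0.6 \begin{bmatrix} 0.4 \\ 0.6 \end{bmatrix} = \begin{bmatrix} 0.58 \\ 0.42 \end{bmatrix}$$

so np.argmax(p) returns 0: the confident probabilities of the first two classifiers outweigh the heavily weighted third, whereas the weighted class-label vote in the previous cell picked class 1.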
In [6]:
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.base import clone
from sklearn.pipeline import _name_estimators
import numpy as np
class MajorityVoteClassifier(BaseEstimator, ClassifierMixin):
    """A majority-vote ensemble classifier.

    Parameters
    ----------
    classifiers : array-like, shape = [n_classifiers]
        Different classifiers for the ensemble.
    vote : str, {'classlabel', 'probability'} (default: 'classlabel')
        If 'classlabel', prediction is based on the argmax of class labels.
        If 'probability', prediction is based on the argmax of the averaged
        class-membership probabilities (recommended for calibrated classifiers).
    weights : array-like, shape = [n_classifiers] (optional, default=None)
        If a list of `int` or `float` values is provided, the classifiers
        are weighted by importance; uniform weights are used if `weights=None`.
    """
    def __init__(self, classifiers, vote='classlabel', weights=None):
        self.classifiers = classifiers
        self.named_classifiers = {key: value for key, value in _name_estimators(classifiers)}
        self.vote = vote
        self.weights = weights
    def fit(self, X, y):
        """Fit the classifiers.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Matrix of training samples.
        y : array-like, shape = [n_samples]
            Vector of target class labels.

        Returns
        -------
        self : object
        """
        # Encode labels to start at 0, which np.argmax/np.bincount in predict rely on
        self.lablenc_ = LabelEncoder()
        self.lablenc_.fit(y)
        self.classes_ = self.lablenc_.classes_
        self.classifiers_ = []
        for clf in self.classifiers:
            fitted_clf = clone(clf).fit(X, self.lablenc_.transform(y))
            self.classifiers_.append(fitted_clf)
        return self
    def predict(self, X):
        """Predict class labels for X."""
        if self.vote == 'probability':
            maj_vote = np.argmax(self.predict_proba(X), axis=1)
        else:  # 'classlabel' vote
            # Collect the predictions: one row per sample, one column per classifier
            predictions = np.asarray([clf.predict(X) for clf in self.classifiers_]).T
            # Weighted majority vote for each sample
            maj_vote = np.apply_along_axis(
                lambda x: np.argmax(np.bincount(x, weights=self.weights)),
                axis=1,
                arr=predictions)
        # Map the encoded labels back to the original class labels
        maj_vote = self.lablenc_.inverse_transform(maj_vote)
        return maj_vote
    def predict_proba(self, X):
        """Predict class probabilities for X."""
        probas = np.asarray([clf.predict_proba(X) for clf in self.classifiers_])
        avg_proba = np.average(probas, axis=0, weights=self.weights)
        return avg_proba
    def get_params(self, deep=True):
        """Get classifier parameter names for GridSearch."""
        if not deep:
            return super(MajorityVoteClassifier, self).get_params(deep=False)
        else:
            out = self.named_classifiers.copy()
            for name, step in self.named_classifiers.items():
                for key, value in step.get_params(deep=True).items():
                    # Use scikit-learn's '<estimator>__<param>' naming convention
                    out['{}__{}'.format(name, key)] = value
            return out
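As a quick smoke test of the class (not part of the original notebook; the toy data is made up for illustration):

# Hypothetical check on a tiny, linearly separable toy set
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

X_demo = np.array([[0.0], [0.2], [0.8], [1.0]])
y_demo = np.array(['a', 'a', 'b', 'b'])
mv = MajorityVoteClassifier(
    classifiers=[LogisticRegression(), DecisionTreeClassifier()],
    vote='classlabel')
print(mv.fit(X_demo, y_demo).predict(X_demo))  # expected: ['a' 'a' 'b' 'b']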
In [23]:
# A 2-D array: one row per sample, one column per classifier's prediction
# (a 1-D array would make apply_along_axis fail with axis=1)
predictions = np.asarray([[1, 1, 0]])
print(predictions)
maj_vote = np.apply_along_axis(
    lambda x: np.argmax(np.bincount(x, weights=[0.2, 0.2, 0.6])),
    axis=1,
    arr=predictions)
maj_vote  # array([0]): weighted counts are [0.6, 0.4]
In [ ]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
if Version(sklearn_version) < '0.18':
    from sklearn.cross_validation import train_test_split
else:
    from sklearn.model_selection import train_test_split
iris = datasets.load_iris()
# Use two features (sepal width, petal length) of classes 1 and 2
X = iris.data[50:, [1, 2]]
y = iris.target[50:]
le = LabelEncoder()
y = le.fit_transform(y)
In [8]:
# Split into 50% training data and 50% test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1)
In [9]:
# Classify with logistic regression, a decision tree, and k-nearest neighbors,
# evaluated with 10-fold cross-validation
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
if Version(sklearn_version) < '0.18':
    from sklearn.cross_validation import cross_val_score
else:
    from sklearn.model_selection import cross_val_score
clf1 = LogisticRegression(penalty='l2', C=0.001, random_state=0)
clf2 = DecisionTreeClassifier(max_depth=1, criterion='entropy', random_state=0)
clf3 = KNeighborsClassifier(n_neighbors=1, p=2, metric='minkowski')
pipe1 = Pipeline([['sc', StandardScaler()],
['clf', clf1]])
pipe3 = Pipeline([['sc', StandardScaler()],
['clf', clf3]])
clfs = [pipe1, clf2, pipe3]
clf_labels = ['Logistic Regression', 'Decision Tree', 'KNN']
for clf, label in zip(clfs, clf_labels):
    scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10, scoring='roc_auc')
    print('ROC AUC: {:.2f} (+/- {:.2f}) [{}]'.format(scores.mean(), scores.std(), label))
In [10]:
mv_clf = MajorityVoteClassifier(classifiers=[pipe1, clf2, pipe3])
clfs.append(mv_clf)
clf_labels.append('Majority Voting')
for clf, label in zip(clfs, clf_labels):
    scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10, scoring='roc_auc')
    print('ROC AUC: {:.2f} (+/- {:.2f}) [{}]'.format(scores.mean(), scores.std(), label))
In [11]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
colors = ['black', 'orange', 'blue', 'green']
linestyles = [':', '--', '-.', '-']
for clf, label, clr, ls in zip(clfs, clf_labels, colors, linestyles):
    # Assumes the positive class is labeled 1
    y_pred = clf.fit(X_train, y_train).predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_pred)
    roc_auc = auc(x=fpr, y=tpr)
    plt.plot(fpr, tpr, color=clr, linestyle=ls, label='{} (auc={:.2f})'.format(label, roc_auc))
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', linewidth=2)
plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.grid()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()
In [12]:
from itertools import product
import numpy as np
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
# Compute the min and max values that bound the decision regions
x_min = X_train_std[:, 0].min() - 1
x_max = X_train_std[:, 0].max() + 1
y_min = X_train_std[:, 1].min() - 1
y_max = X_train_std[:, 1].max() + 1
print(x_min, x_max, y_min, y_max)
# Generate the grid points
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
# Split the plotting area into a 2x2 grid
f, axarr = plt.subplots(nrows=2, ncols=2, sharex='col', sharey='row', figsize=(7, 5))
# Plot the decision regions and scatter the two classes for each classifier;
# idx is a (row, column) tuple giving each classifier's subplot position
for idx, clf, tt in zip(product([0, 1], [0, 1]), clfs, clf_labels):
    clf.fit(X_train_std, y_train)
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    # Subplot to draw on
    ax = axarr[idx[0], idx[1]]
    ax.contourf(xx, yy, Z, alpha=0.3)
    ax.scatter(X_train_std[y_train==0, 0], X_train_std[y_train==0, 1], c='blue', marker='^', s=50)
    ax.scatter(X_train_std[y_train==1, 0], X_train_std[y_train==1, 1], c='red', marker='o', s=50)
    ax.set_title(tt)
plt.text(-3.5, -4.5, s='Sepal width [standardized]', ha='center', va='center', fontsize=12)
plt.text(-11.5, 4.5, s='Petal length [standardized]', ha='center', va='center', fontsize=12, rotation=90)
plt.show()
In [13]:
# List the tunable parameters
mv_clf.get_params()
Out[13]:
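The keys returned by get_params follow scikit-learn's nested naming scheme: the estimator name assigned by _name_estimators, a double underscore, then the parameter name; for a classifier wrapped in a pipeline, the step name is inserted as well. That is why the grid search below addresses the logistic regression's regularization strength as 'pipeline-1__clf__C'. A minimal sketch, assuming the names shown in the output above:

# Hypothetical illustration of the '<estimator>__<parameter>' convention
keys = mv_clf.get_params().keys()
print([k for k in keys if k.endswith('__C')])   # e.g. ['pipeline-1__clf__C']
print([k for k in keys if 'max_depth' in k])    # e.g. ['decisiontreeclassifier__max_depth']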
In [14]:
# Tune the inverse regularization parameter C of the logistic regression
# classifier and the depth of the decision tree
if Version(sklearn_version) < '0.18':
    from sklearn.grid_search import GridSearchCV
else:
    from sklearn.model_selection import GridSearchCV
params = {'decisiontreeclassifier__max_depth': [1, 2],
          'pipeline-1__clf__C': [0.001, 0.1, 100.0]}
grid = GridSearchCV(estimator=mv_clf, param_grid=params, cv=10, scoring='roc_auc')
grid.fit(X_train, y_train)
Out[14]:
In [15]:
# Print each hyperparameter combination with its mean ROC AUC
# scikit-learn < 0.18:
#for params, mean_score, scores in grid.grid_scores_:
#    print('{:.3f}+/-{:.2f} {}'.format(mean_score, scores.std() / 2, params))
# scikit-learn >= 0.18:
cv_keys = ('mean_test_score', 'std_test_score', 'params')
for r, _ in enumerate(grid.cv_results_['mean_test_score']):
    print('{:.3f}+/-{:.2f} {}'.format(grid.cv_results_[cv_keys[0]][r],
                                      grid.cv_results_[cv_keys[1]][r] / 2.0,
                                      grid.cv_results_[cv_keys[2]][r]))
# Print the best-scoring parameter combination and its score
print('Best parameters: {}'.format(grid.best_params_))
print('ROC AUC: {:.2f}'.format(grid.best_score_))
In [16]:
import pandas as pd
# Load the Wine dataset
df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None)
df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
'Alcalinity of ash', 'Magnesium', 'Total phenols',
'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',
'Proline']
# Keep only classes 2 and 3
df_wine = df_wine[df_wine['Class label'] != 1]
y = df_wine['Class label'].values
# Select two features: Alcohol and Hue
X = df_wine[['Alcohol', 'Hue']].values
df_wine.head()
Out[16]:
In [17]:
from sklearn.preprocessing import LabelEncoder
if Version(sklearn_version) < '0.18':
    from sklearn.cross_validation import train_test_split
else:
    from sklearn.model_selection import train_test_split
le = LabelEncoder()
y = le.fit_transform(y)
# Split into a 60% training set and a 40% test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=1)
print(len(X_train), len(X_test))
In [18]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion='entropy', max_depth=None, random_state=1)
# Build an ensemble of 500 decision trees, each fit to a different bootstrap sample of the training set
bag = BaggingClassifier(base_estimator=tree, n_estimators=500, max_samples=1.0, max_features=1.0,
bootstrap=True, bootstrap_features=False, n_jobs=1, random_state=1)
bag
Out[18]:
In [19]:
# Performance of a single unpruned decision tree
from sklearn.metrics import accuracy_score
tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)
tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print('Decision tree train/test accuracies {:.3f}/{:.3f}'.format(tree_train, tree_test))
In [20]:
# Performance of the bagging ensemble
bag = bag.fit(X_train, y_train)
y_train_pred = bag.predict(X_train)
y_test_pred = bag.predict(X_test)
bag_train = accuracy_score(y_train, y_train_pred)
bag_test = accuracy_score(y_test, y_test_pred)
print('Bagging train/test accuracies {:.3f}/{:.3f}'.format(bag_train, bag_test))
In [21]:
# Compare the decision regions of the decision tree and the bagging classifier
import numpy as np
import matplotlib.pyplot as plt
# Compute the min and max values that bound the decision regions
x_min = X_train[:, 0].min() - 1
x_max = X_train[:, 0].max() + 1
y_min = X_train[:, 1].min() - 1
y_max = X_train[:, 1].max() + 1
print(x_min, x_max, y_min, y_max)
# Generate the grid points
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
# Split the plotting area into two columns
f, axarr = plt.subplots(nrows=1, ncols=2, sharex='col', sharey='row', figsize=(8, 3))
for idx, clf, tt in zip([0, 1], [tree, bag], ['Decision Tree', 'Bagging']):
    clf.fit(X_train, y_train)
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    # Subplot to draw on
    ax = axarr[idx]
    ax.contourf(xx, yy, Z, alpha=0.3)
    ax.scatter(X_train[y_train==0, 0], X_train[y_train==0, 1], c='blue', marker='^')
    ax.scatter(X_train[y_train==1, 0], X_train[y_train==1, 1], c='red', marker='o')
    ax.set_title(tt)
axarr[0].set_ylabel('Alcohol', fontsize=12)
plt.text(10.2, -1.2, s='Hue', ha='center', va='center', fontsize=12)
plt.show()
In [24]:
from sklearn.ensemble import AdaBoostClassifier
tree = DecisionTreeClassifier(criterion='entropy', max_depth=1, random_state=0)
ada = AdaBoostClassifier(base_estimator=tree, n_estimators=500, learning_rate=0.1, random_state=0)
tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)
tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print('Decision tree train/test accuracies {:.3f}/{:.3f}'.format(tree_train, tree_test))
In [25]:
ada = ada.fit(X_train, y_train)
y_train_pred = ada.predict(X_train)
y_test_pred = ada.predict(X_test)
ada_train = accuracy_score(y_train, y_train_pred)
ada_test = accuracy_score(y_test, y_test_pred)
print('AdaBoost train/test accuracies {:.3f}/{:.3f}'.format(ada_train, ada_test))
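As a side check (not in the original notebook), the boosting process can be traced round by round with AdaBoostClassifier's staged_predict, which yields the ensemble's predictions after each of the 500 boosting iterations; a minimal sketch, assuming ada has been fit as above:

# Plot test accuracy as a function of the number of boosting rounds
staged_acc = [accuracy_score(y_test, y_pred)
              for y_pred in ada.staged_predict(X_test)]
plt.plot(range(1, len(staged_acc) + 1), staged_acc)
plt.xlabel('Number of boosting rounds')
plt.ylabel('Test accuracy')
plt.show()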
In [26]:
# Compare the decision regions of the decision tree and the AdaBoost classifier
import numpy as np
import matplotlib.pyplot as plt
# Compute the min and max values that bound the decision regions
x_min = X_train[:, 0].min() - 1
x_max = X_train[:, 0].max() + 1
y_min = X_train[:, 1].min() - 1
y_max = X_train[:, 1].max() + 1
print(x_min, x_max, y_min, y_max)
# Generate the grid points
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
# Split the plotting area into two columns
f, axarr = plt.subplots(nrows=1, ncols=2, sharex='col', sharey='row', figsize=(8, 3))
for idx, clf, tt in zip([0, 1], [tree, ada], ['Decision Tree', 'AdaBoost']):
    clf.fit(X_train, y_train)
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    # Subplot to draw on
    ax = axarr[idx]
    ax.contourf(xx, yy, Z, alpha=0.3)
    ax.scatter(X_train[y_train==0, 0], X_train[y_train==0, 1], c='blue', marker='^')
    ax.scatter(X_train[y_train==1, 0], X_train[y_train==1, 1], c='red', marker='o')
    ax.set_title(tt)
axarr[0].set_ylabel('Alcohol', fontsize=12)
plt.text(10.2, -1.2, s='Hue', ha='center', va='center', fontsize=12)
plt.show()